From 3fa3709987bd2bfd34ff52ed522425a1deb07b26 Mon Sep 17 00:00:00 2001 From: Brian Ward Date: Mon, 10 Apr 2023 11:30:09 -0400 Subject: [PATCH 01/12] Standalone log_prob output renamed --- cmdstanpy/model.py | 2 +- test/test_log_prob.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py index a4faf633..54faa22a 100644 --- a/cmdstanpy/model.py +++ b/cmdstanpy/model.py @@ -1625,7 +1625,7 @@ def log_prob( either as a dictionary with entries matching the data variables, or as the path of a data file in JSON or Rdump format. - :return: A pandas.DataFrame containing columns "lp_" and additional + :return: A pandas.DataFrame containing columns "lp__" and additional columns for the gradient values. These gradients will be for the unconstrained parameters of the model. """ diff --git a/test/test_log_prob.py b/test/test_log_prob.py index c468ceed..1a4e6afd 100644 --- a/test/test_log_prob.py +++ b/test/test_log_prob.py @@ -22,7 +22,7 @@ def test_lp_good() -> None: model = CmdStanModel(stan_file=BERN_STAN) x = model.log_prob({"theta": 0.1}, data=BERN_DATA) - assert "lp_" in x.columns + assert "lp__" in x.columns[0] def test_lp_bad( From 797e24b9d2e946ea5b71dc9bedd0b462aafc44e0 Mon Sep 17 00:00:00 2001 From: Brian Ward Date: Tue, 25 Apr 2023 15:41:40 -0400 Subject: [PATCH 02/12] Start laplace work --- cmdstanpy/cmdstan_args.py | 40 ++++++++++++++++++++++++++++++++--- cmdstanpy/model.py | 8 +++++++ cmdstanpy/stanfit/__init__.py | 1 + 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/cmdstanpy/cmdstan_args.py b/cmdstanpy/cmdstan_args.py index 49b96a20..9bb1c3c6 100644 --- a/cmdstanpy/cmdstan_args.py +++ b/cmdstanpy/cmdstan_args.py @@ -29,6 +29,7 @@ class Method(Enum): OPTIMIZE = auto() GENERATE_QUANTITIES = auto() VARIATIONAL = auto() + LAPLACE = auto() def __repr__(self) -> str: return '<%s.%s>' % (self.__class__.__name__, self.name) @@ -398,6 +399,7 @@ def __init__( tol_rel_grad: Optional[float] = None, tol_param: Optional[float] = None, history_size: Optional[int] = None, + jacobian: bool = False, ) -> None: self.algorithm = algorithm or "" @@ -410,6 +412,7 @@ def __init__( self.tol_rel_grad = tol_rel_grad self.tol_param = tol_param self.history_size = history_size + self.jacobian = jacobian self.thin = None def validate( @@ -511,8 +514,7 @@ def validate( else: raise ValueError('history_size must be type of int') - # pylint: disable=unused-argument - def compose(self, idx: int, cmd: List[str]) -> List[str]: + def compose(self, _idx: int, cmd: List[str]) -> List[str]: """compose command string for CmdStan for non-default arg values.""" cmd.append('method=optimize') if self.algorithm: @@ -535,7 +537,37 @@ def compose(self, idx: int, cmd: List[str]) -> List[str]: cmd.append('iter={}'.format(self.iter)) if self.save_iterations: cmd.append('save_iterations=1') + if self.jacobian: + cmd.append("jacobian=1") + return cmd + + +class LaplaceArgs: + """Arguments needed for laplace method.""" + def __init__( + self, mode: str, draws: Optional[int] = None, jacobian: bool = True + ) -> None: + self.mode = mode + self.jacobian = jacobian + self.draws = draws + + def validate(self) -> None: + """Check arguments correctness and consistency.""" + if not os.path.exists(self.mode): + raise ValueError(f'Invalid path for mode file: {self.mode}') + if self.draws is not None: + if not isinstance(self.draws, (int, np.integer)) or self.draws <= 0: + raise ValueError('draws must be a positive integer') + + def compose(self, _idx: int, cmd: List[str]) -> List[str]: + """compose command string for CmdStan for non-default arg values.""" + cmd.append('method=laplace') + cmd.append(f'mode={self.mode}') + if self.draws: + cmd.append(f'draws={self.draws}') + if not self.jacobian: + cmd.append("jacobian=0") return cmd @@ -753,6 +785,8 @@ def __init__( self.method = Method.GENERATE_QUANTITIES elif isinstance(method_args, VariationalArgs): self.method = Method.VARIATIONAL + elif isinstance(method_args, LaplaceArgs): + self.method = Method.LAPLACE self.method_args.validate(len(chain_ids) if chain_ids else None) self.validate() @@ -913,7 +947,7 @@ def compose_command( *, diagnostic_file: Optional[str] = None, profile_file: Optional[str] = None, - num_chains: Optional[int] = None + num_chains: Optional[int] = None, ) -> List[str]: """ Compose CmdStan command for non-default arguments. diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py index 54faa22a..269cc617 100644 --- a/cmdstanpy/model.py +++ b/cmdstanpy/model.py @@ -588,6 +588,8 @@ def optimize( refresh: Optional[int] = None, time_fmt: str = "%Y%m%d%H%M%S", timeout: Optional[float] = None, + jacobian: bool = False, + # would be nice to move this further up, but that's a breaking change ) -> CmdStanMLE: """ Run the specified CmdStan optimize algorithm to produce a @@ -689,6 +691,11 @@ def optimize( :param timeout: Duration at which optimization times out in seconds. + :param jacobian: Whether or not to use the Jacobian adjustment for + constrained variables in optimization. By default this is false, + meaning optimization yields the Maximum Likehood Estimate (MLE). + Setting it to true yields the Maximum A Posteriori Estimate (MAP). + :return: CmdStanMLE object """ optimize_args = OptimizeArgs( @@ -702,6 +709,7 @@ def optimize( history_size=history_size, iter=iter, save_iterations=save_iterations, + jacobian=jacobian, ) with MaybeDictToFilePath(data, inits) as (_data, _inits): diff --git a/cmdstanpy/stanfit/__init__.py b/cmdstanpy/stanfit/__init__.py index b966974a..6c7fc8e2 100644 --- a/cmdstanpy/stanfit/__init__.py +++ b/cmdstanpy/stanfit/__init__.py @@ -170,6 +170,7 @@ def from_csv( optimize_args = OptimizeArgs( algorithm=config_dict['algorithm'], save_iterations=config_dict['save_iterations'], + jacobian=config_dict.get('jacobian', 0), ) cmdstan_args = CmdStanArgs( model_name=config_dict['model'], From 76e30e59f6e462af773a1f5f8b3eb72f66d5ea28 Mon Sep 17 00:00:00 2001 From: Brian Ward Date: Mon, 15 May 2023 12:07:23 -0400 Subject: [PATCH 03/12] Add jacobian flag to log_prob --- cmdstanpy/model.py | 6 ++++++ test/test_log_prob.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py index 269cc617..cc105466 100644 --- a/cmdstanpy/model.py +++ b/cmdstanpy/model.py @@ -1613,6 +1613,8 @@ def log_prob( self, params: Union[Dict[str, Any], str, os.PathLike], data: Union[Mapping[str, Any], str, os.PathLike, None] = None, + *, + jacobian: bool = True, ) -> pd.DataFrame: """ Calculate the log probability and gradient at the given parameter @@ -1633,6 +1635,9 @@ def log_prob( either as a dictionary with entries matching the data variables, or as the path of a data file in JSON or Rdump format. + :param jacobian: Whether or not to enable the Jacobian adjustment + for constrained parameters. Defaults to ``True``. + :return: A pandas.DataFrame containing columns "lp__" and additional columns for the gradient values. These gradients will be for the unconstrained parameters of the model. @@ -1648,6 +1653,7 @@ def log_prob( str(self.exe_file), "log_prob", f"constrained_params={_params}", + f"jacobian={int(jacobian)}", ] if _data is not None: cmd += ["data", f"file={_data}"] diff --git a/test/test_log_prob.py b/test/test_log_prob.py index 1a4e6afd..1fda96ca 100644 --- a/test/test_log_prob.py +++ b/test/test_log_prob.py @@ -5,6 +5,7 @@ import re from test import check_present +import numpy as np import pytest from cmdstanpy.model import CmdStanModel @@ -21,8 +22,14 @@ def test_lp_good() -> None: model = CmdStanModel(stan_file=BERN_STAN) - x = model.log_prob({"theta": 0.1}, data=BERN_DATA) - assert "lp__" in x.columns[0] + out = model.log_prob({"theta": 0.1}, data=BERN_DATA) + assert "lp_" in out.columns[0] + + out_unadjusted = model.log_prob( + {"theta": 0.1}, data=BERN_DATA, jacobian=False + ) + assert "lp_" in out_unadjusted.columns[0] + assert not np.allclose(out.to_numpy(), out_unadjusted.to_numpy()) def test_lp_bad( From 962f1b36e7e3ae8e410dc33321bb1256d978bf14 Mon Sep 17 00:00:00 2001 From: Brian Ward Date: Mon, 15 May 2023 12:52:20 -0400 Subject: [PATCH 04/12] Factor out some of the stan_variable shared logic --- cmdstanpy/stanfit/gq.py | 22 +++++++++------------- cmdstanpy/stanfit/mcmc.py | 30 +++++++++++------------------- cmdstanpy/utils/data_munging.py | 27 +++++++++++++++++++++++++++ 3 files changed, 47 insertions(+), 32 deletions(-) diff --git a/cmdstanpy/stanfit/gq.py b/cmdstanpy/stanfit/gq.py index 5555ad7e..a1626efe 100644 --- a/cmdstanpy/stanfit/gq.py +++ b/cmdstanpy/stanfit/gq.py @@ -32,12 +32,12 @@ from cmdstanpy.cmdstan_args import Method from cmdstanpy.utils import ( - BaseType, build_xarray_data, flatten_chains, get_logger, scan_generated_quantities_csv, ) +from cmdstanpy.utils.data_munging import extract_reshape from .mcmc import CmdStanMCMC from .metadata import InferenceMetadata @@ -586,21 +586,17 @@ def stan_variable( # is gq variable self._assemble_generated_quantities() - draw1, num_draws = self._draws_start(inc_warmup) - dims = [num_draws * self.chains] + dims = (num_draws * self.chains,) col_idxs = self._metadata.stan_vars_cols[var] - if len(col_idxs) > 0: - dims.extend(self._metadata.stan_vars_dims[var]) - draws = self._draws[draw1:, :, col_idxs] - - if self._metadata.stan_vars_types[var] == BaseType.COMPLEX: - draws = draws[..., ::2] + 1j * draws[..., 1::2] - dims = dims[:-1] - draws = draws.reshape(dims, order='F') - - return draws + return extract_reshape( + dims=dims + self._metadata.stan_vars_dims[var], + col_idxs=col_idxs, + var_type=self._metadata.stan_vars_types[var], + start_row=draw1, + draws_in=self._draws, + ) def stan_variables(self, inc_warmup: bool = False) -> Dict[str, np.ndarray]: """ diff --git a/cmdstanpy/stanfit/mcmc.py b/cmdstanpy/stanfit/mcmc.py index 8e0c9b58..d3e5a851 100644 --- a/cmdstanpy/stanfit/mcmc.py +++ b/cmdstanpy/stanfit/mcmc.py @@ -31,7 +31,6 @@ from cmdstanpy.cmdstan_args import Method, SamplerArgs from cmdstanpy.utils import ( EXTENSION, - BaseType, build_xarray_data, check_sampler_csv, cmdstan_path, @@ -41,6 +40,7 @@ flatten_chains, get_logger, ) +from cmdstanpy.utils.data_munging import extract_reshape from .metadata import InferenceMetadata from .runset import RunSet @@ -740,26 +740,18 @@ def stan_variable( 'Available variables are ' + ", ".join(self._metadata.stan_vars_dims) ) - self._assemble_draws() - draw1 = 0 - if not inc_warmup and self._save_warmup: - draw1 = self.num_draws_warmup - num_draws = self.num_draws_sampling - if inc_warmup and self._save_warmup: - num_draws += self.num_draws_warmup - dims = [num_draws * self.chains] - col_idxs = self._metadata.stan_vars_cols[var] - if len(col_idxs) > 0: - dims.extend(self._metadata.stan_vars_dims[var]) - draws = self._draws[draw1:, :, col_idxs] - - if self._metadata.stan_vars_types[var] == BaseType.COMPLEX: - draws = draws[..., ::2] + 1j * draws[..., 1::2] - dims = dims[:-1] - draws = draws.reshape(dims, order='F') + draws = self.draws(inc_warmup=inc_warmup) + dims = (draws.shape[0] * self.chains,) + col_idxs = self._metadata.stan_vars_cols[var] - return draws + return extract_reshape( + dims=dims + self._metadata.stan_vars_dims[var], + col_idxs=col_idxs, + var_type=self._metadata.stan_vars_types[var], + start_row=0, + draws_in=draws, + ) def stan_variables(self) -> Dict[str, np.ndarray]: """ diff --git a/cmdstanpy/utils/data_munging.py b/cmdstanpy/utils/data_munging.py index a9ee96dd..63d12cfd 100644 --- a/cmdstanpy/utils/data_munging.py +++ b/cmdstanpy/utils/data_munging.py @@ -63,3 +63,30 @@ def build_xarray_data( var_dims, np.squeeze(drawset[start_row:, :, col_idxs], axis=2), ) + + +def extract_reshape( + *, + dims: Tuple[int, ...], + col_idxs: Tuple[int, ...], + var_type: BaseType, + start_row: int, + draws_in: np.ndarray, +) -> np.ndarray: + """ + Helper to turn a draws table into a numpy array of draws. + Includes special handling for complex numbers. + """ + # TODO also use in MLE, VB + if dims: + draws = draws_in[start_row:, :, col_idxs] + + if var_type == BaseType.COMPLEX: + draws = draws[..., ::2] + 1j * draws[..., 1::2] + dims = dims[:-1] + + draws = draws.reshape(*dims, order="F") + + return draws + else: + return np.squeeze(draws_in[start_row:, :, col_idxs], axis=2) From be6587c6306c5c3880e8ec58e1cf7b0a8b8bb019 Mon Sep 17 00:00:00 2001 From: Brian Ward Date: Mon, 15 May 2023 14:55:30 -0400 Subject: [PATCH 05/12] Sketch of method, still need to write class --- cmdstanpy/cmdstan_args.py | 13 +++--- cmdstanpy/model.py | 84 ++++++++++++++++++++++++++++++++++- cmdstanpy/stanfit/__init__.py | 4 +- cmdstanpy/stanfit/laplace.py | 23 ++++++++++ 4 files changed, 116 insertions(+), 8 deletions(-) create mode 100644 cmdstanpy/stanfit/laplace.py diff --git a/cmdstanpy/cmdstan_args.py b/cmdstanpy/cmdstan_args.py index 9bb1c3c6..f6426817 100644 --- a/cmdstanpy/cmdstan_args.py +++ b/cmdstanpy/cmdstan_args.py @@ -401,7 +401,6 @@ def __init__( history_size: Optional[int] = None, jacobian: bool = False, ) -> None: - self.algorithm = algorithm or "" self.init_alpha = init_alpha self.iter = iter @@ -415,9 +414,7 @@ def __init__( self.jacobian = jacobian self.thin = None - def validate( - self, chains: Optional[int] = None # pylint: disable=unused-argument - ) -> None: + def validate(self, _chains: Optional[int] = None) -> None: """ Check arguments correctness and consistency. """ @@ -552,7 +549,7 @@ def __init__( self.jacobian = jacobian self.draws = draws - def validate(self) -> None: + def validate(self, _chains: Optional[int] = None) -> None: """Check arguments correctness and consistency.""" if not os.path.exists(self.mode): raise ValueError(f'Invalid path for mode file: {self.mode}') @@ -753,7 +750,11 @@ def __init__( model_exe: OptionalPath, chain_ids: Optional[List[int]], method_args: Union[ - SamplerArgs, OptimizeArgs, GenerateQuantitiesArgs, VariationalArgs + SamplerArgs, + OptimizeArgs, + GenerateQuantitiesArgs, + VariationalArgs, + LaplaceArgs, ], data: Union[Mapping[str, Any], str, None] = None, seed: Union[int, List[int], None] = None, diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py index cc105466..7c4434ce 100644 --- a/cmdstanpy/model.py +++ b/cmdstanpy/model.py @@ -40,6 +40,8 @@ from cmdstanpy.cmdstan_args import ( CmdStanArgs, GenerateQuantitiesArgs, + LaplaceArgs, + Method, OptimizeArgs, SamplerArgs, VariationalArgs, @@ -47,6 +49,7 @@ from cmdstanpy.compiler_opts import CompilerOptions from cmdstanpy.stanfit import ( CmdStanGQ, + CmdStanLaplace, CmdStanMCMC, CmdStanMLE, CmdStanVB, @@ -393,7 +396,7 @@ def format( + '.bak-' + datetime.now().strftime("%Y%m%d%H%M%S"), ) - with (open(self.stan_file, 'w')) as file_handle: + with open(self.stan_file, 'w') as file_handle: file_handle.write(result) else: print(result) @@ -1682,6 +1685,85 @@ def log_prob( result = pd.read_csv(output, comment="#") return result + def laplace_sample( + self, + data: Union[Mapping[str, Any], str, os.PathLike, None] = None, + mode: Union[CmdStanMLE, str, os.PathLike, None] = None, + draws: Optional[int] = None, + *, + jacobian: bool = True, # NB: Different than optimize! + seed: Optional[int] = None, + output_dir: OptionalPath = None, + sig_figs: Optional[int] = None, + save_profile: bool = False, + show_console: bool = False, + refresh: Optional[int] = None, + time_fmt: str = "%Y%m%d%H%M%S", + timeout: Optional[float] = None, + opt_args: Dict[str, Any] = {}, + ) -> CmdStanLaplace: + if mode is None: + optimize_args = { + "seed": seed, + "sig_figs": sig_figs, + "jacobian": jacobian, + "save_profile": save_profile, + "show_console": show_console, + "refresh": refresh, + "time_fmt": time_fmt, + "timeout": timeout, + "output_dir": output_dir, + } + optimize_args.update(opt_args) + try: + cmdstan_mode: CmdStanMLE = self.optimize( + data=data, + **optimize_args, # type: ignore + ) + except Exception as e: + raise RuntimeError( + "Failed to run optimizer on model. " + "Consider supplying a mode or additional optimizer args" + ) from e + elif not isinstance(mode, CmdStanMLE): + cmdstan_mode = from_csv(mode) # type: ignore # we check below + if cmdstan_mode.runset.method != Method.OPTIMIZE: + raise ValueError( + "Mode must be a CmdStanMLE or a path to an optimize CSV" + ) + else: + cmdstan_mode = mode + + # TODO: jacobian warnings on mismatch + + laplace_args = LaplaceArgs( + cmdstan_mode.runset.csv_files[0], draws, jacobian + ) + + with MaybeDictToFilePath(data) as (_data,): + args = CmdStanArgs( + self._name, + self._exe_file, + chain_ids=None, + data=_data, + seed=seed, + output_dir=output_dir, + sig_figs=sig_figs, + save_profile=save_profile, + method_args=laplace_args, + refresh=refresh, + ) + dummy_chain_id = 0 + runset = RunSet(args=args, chains=1, time_fmt=time_fmt) + self._run_cmdstan( + runset, + dummy_chain_id, + show_console=show_console, + timeout=timeout, + ) + runset.raise_for_timeouts() + return CmdStanLaplace(runset, cmdstan_mode) + def _run_cmdstan( self, runset: RunSet, diff --git a/cmdstanpy/stanfit/__init__.py b/cmdstanpy/stanfit/__init__.py index 6c7fc8e2..6a7dd7ca 100644 --- a/cmdstanpy/stanfit/__init__.py +++ b/cmdstanpy/stanfit/__init__.py @@ -13,6 +13,7 @@ from cmdstanpy.utils import check_sampler_csv, get_logger, scan_config from .gq import CmdStanGQ +from .laplace import CmdStanLaplace from .mcmc import CmdStanMCMC from .metadata import InferenceMetadata from .mle import CmdStanMLE @@ -26,6 +27,7 @@ "CmdStanMLE", "CmdStanVB", "CmdStanGQ", + "CmdStanLaplace", ] @@ -143,7 +145,7 @@ def from_csv( save_warmup=config_dict['save_warmup'], fixed_param=True, ) - except (ValueError) as e: + except ValueError as e: raise ValueError( 'Invalid or corrupt Stan CSV output file, ' ) from e diff --git a/cmdstanpy/stanfit/laplace.py b/cmdstanpy/stanfit/laplace.py new file mode 100644 index 00000000..ce4117e7 --- /dev/null +++ b/cmdstanpy/stanfit/laplace.py @@ -0,0 +1,23 @@ +""" + Container for the result of running a laplace approximation. +""" + +from cmdstanpy.cmdstan_args import Method + +# from cmdstanpy.utils.data_munging import extract_reshape +# from cmdstanpy.cmdstan_args import LaplaceArgs, +# from .metadata import InferenceMetadata +from .mle import CmdStanMLE +from .runset import RunSet + + +class CmdStanLaplace: + def __init__(self, runset: RunSet, mode: CmdStanMLE) -> None: + """Initialize object.""" + if not runset.method == Method.LAPLACE: + raise ValueError( + 'Wrong runset method, expecting laplace runset, ' + 'found method {}'.format(runset.method) + ) + self.runset = runset + self.mode = mode From de9f7c3abb8132452e925271e16cf40e3ed913d2 Mon Sep 17 00:00:00 2001 From: Brian Ward Date: Mon, 15 May 2023 16:38:54 -0400 Subject: [PATCH 06/12] Basic stan_variable function --- cmdstanpy/model.py | 5 +-- cmdstanpy/stanfit/laplace.py | 58 +++++++++++++++++++++++++++++++-- cmdstanpy/utils/data_munging.py | 2 +- cmdstanpy/utils/stancsv.py | 10 ++++++ 4 files changed, 70 insertions(+), 5 deletions(-) diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py index 7c4434ce..20150e5e 100644 --- a/cmdstanpy/model.py +++ b/cmdstanpy/model.py @@ -1700,7 +1700,7 @@ def laplace_sample( refresh: Optional[int] = None, time_fmt: str = "%Y%m%d%H%M%S", timeout: Optional[float] = None, - opt_args: Dict[str, Any] = {}, + opt_args: Optional[Dict[str, Any]] = None, ) -> CmdStanLaplace: if mode is None: optimize_args = { @@ -1714,7 +1714,8 @@ def laplace_sample( "timeout": timeout, "output_dir": output_dir, } - optimize_args.update(opt_args) + optimize_args.update(opt_args or {}) + optimize_args['time_fmt'] = 'opt-' + time_fmt try: cmdstan_mode: CmdStanMLE = self.optimize( data=data, diff --git a/cmdstanpy/stanfit/laplace.py b/cmdstanpy/stanfit/laplace.py index ce4117e7..a201c853 100644 --- a/cmdstanpy/stanfit/laplace.py +++ b/cmdstanpy/stanfit/laplace.py @@ -2,11 +2,16 @@ Container for the result of running a laplace approximation. """ +from typing import Optional + +import numpy as np + from cmdstanpy.cmdstan_args import Method +from cmdstanpy.utils.data_munging import extract_reshape +from cmdstanpy.utils.stancsv import scan_laplace_csv -# from cmdstanpy.utils.data_munging import extract_reshape # from cmdstanpy.cmdstan_args import LaplaceArgs, -# from .metadata import InferenceMetadata +from .metadata import InferenceMetadata from .mle import CmdStanMLE from .runset import RunSet @@ -21,3 +26,52 @@ def __init__(self, runset: RunSet, mode: CmdStanMLE) -> None: ) self.runset = runset self.mode = mode + + self._draws: np.ndarray = np.array(()) + + config = scan_laplace_csv(runset.csv_files[0]) + self._metadata = InferenceMetadata(config) + + def _assemble_draws(self) -> None: + if self._draws.shape != (0,): + return + + with open(self.runset.csv_files[0], 'r') as fd: + lines = (line for line in fd if not line.startswith('#')) + self._draws = np.loadtxt( + lines, + dtype=float, + ndmin=2, + skiprows=1, + delimiter=',', + ) + + def stan_variable(self, var: str) -> np.ndarray: + + self._assemble_draws() + draws = self._draws + dims = (draws.shape[0],) + col_idxs = self._metadata.stan_vars_cols[var] + return extract_reshape( + dims=dims + self._metadata.stan_vars_dims[var], + col_idxs=col_idxs, + var_type=self._metadata.stan_vars_types[var], + start_row=0, + draws_in=draws, + ) + + def save_csvfiles(self, dir: Optional[str] = None) -> None: + """ + Move output CSV files to specified directory. If files were + written to the temporary session directory, clean filename. + E.g., save 'bernoulli-201912081451-1-5nm6as7u.csv' as + 'bernoulli-201912081451-1.csv'. + + :param dir: directory path + + See Also + -------- + stanfit.RunSet.save_csvfiles + cmdstanpy.from_csv + """ + self.runset.save_csvfiles(dir) diff --git a/cmdstanpy/utils/data_munging.py b/cmdstanpy/utils/data_munging.py index 63d12cfd..d0252065 100644 --- a/cmdstanpy/utils/data_munging.py +++ b/cmdstanpy/utils/data_munging.py @@ -79,7 +79,7 @@ def extract_reshape( """ # TODO also use in MLE, VB if dims: - draws = draws_in[start_row:, :, col_idxs] + draws = draws_in[start_row:, ..., col_idxs] if var_type == BaseType.COMPLEX: draws = draws[..., ::2] + 1j * draws[..., 1::2] diff --git a/cmdstanpy/utils/stancsv.py b/cmdstanpy/utils/stancsv.py index efa9efaa..c92d45b9 100644 --- a/cmdstanpy/utils/stancsv.py +++ b/cmdstanpy/utils/stancsv.py @@ -142,6 +142,16 @@ def scan_optimize_csv(path: str, save_iters: bool = False) -> Dict[str, Any]: return dict +def scan_laplace_csv(path: str) -> Dict[str, Any]: + """Process laplace stan_csv output file line by line.""" + dict: Dict[str, Any] = {} + lineno = 0 + with open(path, 'r') as fd: + lineno = scan_config(fd, dict, lineno) + lineno = scan_column_names(fd, dict, lineno) + return dict + + def scan_generated_quantities_csv(path: str) -> Dict[str, Any]: """ Process standalone generated quantities stan_csv output file line by line. From f0445a0d2a5d7f405b125c328509c88eb162fc7e Mon Sep 17 00:00:00 2001 From: Brian Ward Date: Tue, 16 May 2023 11:30:57 -0400 Subject: [PATCH 07/12] Checkpointing --- cmdstanpy/stanfit/laplace.py | 81 +++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 6 deletions(-) diff --git a/cmdstanpy/stanfit/laplace.py b/cmdstanpy/stanfit/laplace.py index a201c853..08ad2070 100644 --- a/cmdstanpy/stanfit/laplace.py +++ b/cmdstanpy/stanfit/laplace.py @@ -2,7 +2,7 @@ Container for the result of running a laplace approximation. """ -from typing import Optional +from typing import Dict, Optional import numpy as np @@ -10,11 +10,16 @@ from cmdstanpy.utils.data_munging import extract_reshape from cmdstanpy.utils.stancsv import scan_laplace_csv -# from cmdstanpy.cmdstan_args import LaplaceArgs, from .metadata import InferenceMetadata from .mle import CmdStanMLE from .runset import RunSet +# TODO list: +# - remaining methods +# - tests +# - docs and example notebook +# - make sure features like standalone GQ are updated/working + class CmdStanLaplace: def __init__(self, runset: RunSet, mode: CmdStanMLE) -> None: @@ -24,8 +29,8 @@ def __init__(self, runset: RunSet, mode: CmdStanMLE) -> None: 'Wrong runset method, expecting laplace runset, ' 'found method {}'.format(runset.method) ) - self.runset = runset - self.mode = mode + self._runset = runset + self._mode = mode self._draws: np.ndarray = np.array(()) @@ -36,7 +41,8 @@ def _assemble_draws(self) -> None: if self._draws.shape != (0,): return - with open(self.runset.csv_files[0], 'r') as fd: + # TODO: should we fake a chain dimension? + with open(self._runset.csv_files[0], 'r') as fd: lines = (line for line in fd if not line.startswith('#')) self._draws = np.loadtxt( lines, @@ -47,7 +53,23 @@ def _assemble_draws(self) -> None: ) def stan_variable(self, var: str) -> np.ndarray: + """ + Return a numpy.ndarray which contains the estimates for the + for the named Stan program variable where the dimensions of the + numpy.ndarray match the shape of the Stan program variable. + + This functionaltiy is also available via a shortcut using ``.`` - + writing ``fit.a`` is a synonym for ``fit.stan_variable("a")`` + :param var: variable name + + See Also + -------- + CmdStanMLE.stan_variables + CmdStanMCMC.stan_variable + CmdStanVB.stan_variable + CmdStanGQ.stan_variable + """ self._assemble_draws() draws = self._draws dims = (draws.shape[0],) @@ -60,6 +82,53 @@ def stan_variable(self, var: str) -> np.ndarray: draws_in=draws, ) + def stan_variables(self) -> Dict[str, np.ndarray]: + """ + Return a dictionary mapping Stan program variables names + to the corresponding numpy.ndarray containing the inferred values. + + :param inc_warmup: When ``True`` and the warmup draws are present in + the MCMC sample, then the warmup draws are included. + Default value is ``False`` + + See Also + -------- + CmdStanGQ.stan_variable + CmdStanMCMC.stan_variables + CmdStanMLE.stan_variables + CmdStanVB.stan_variables + """ + result = {} + for name in self._metadata.stan_vars_dims.keys(): + result[name] = self.stan_variable(name) + return result + + def method_variables(self) -> Dict[str, np.ndarray]: + """ + Returns a dictionary of all sampler variables, i.e., all + output column names ending in `__`. Assumes that all variables + are scalar variables where column name is variable name. + Maps each column name to a numpy.ndarray (draws x chains x 1) + containing per-draw diagnostic values. + """ + result = {} + self._assemble_draws() + for name, idxs in self._metadata.method_vars_cols.items(): + result[name] = self._draws[..., idxs[0]] + return result + + # def draws + # def draws_pd + # def draws_xr + + @property + def mode(self) -> CmdStanMLE: + """ + Return the maximum a posteriori estimate (mode) + as a :class:`CmdStanMLE` object. + """ + return self._mode + def save_csvfiles(self, dir: Optional[str] = None) -> None: """ Move output CSV files to specified directory. If files were @@ -74,4 +143,4 @@ def save_csvfiles(self, dir: Optional[str] = None) -> None: stanfit.RunSet.save_csvfiles cmdstanpy.from_csv """ - self.runset.save_csvfiles(dir) + self._runset.save_csvfiles(dir) From a69b34add041ba115c6da4e681ed5de3315b6cf5 Mon Sep 17 00:00:00 2001 From: Brian Ward Date: Wed, 14 Jun 2023 16:29:30 -0400 Subject: [PATCH 08/12] Version checks --- cmdstanpy/model.py | 12 ++++++++++++ test/test_laplace.py | 0 2 files changed, 12 insertions(+) create mode 100644 test/test_laplace.py diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py index 20150e5e..c011f56e 100644 --- a/cmdstanpy/model.py +++ b/cmdstanpy/model.py @@ -715,6 +715,12 @@ def optimize( jacobian=jacobian, ) + if jacobian and cmdstan_version_before(2, 32, self.exe_info()): + raise ValueError( + "Jacobian adjustment for optimization is only supported " + "in CmdStan 2.32 and above." + ) + with MaybeDictToFilePath(data, inits) as (_data, _inits): args = CmdStanArgs( self._name, @@ -1702,6 +1708,12 @@ def laplace_sample( timeout: Optional[float] = None, opt_args: Optional[Dict[str, Any]] = None, ) -> CmdStanLaplace: + + if cmdstan_version_before(2, 32, self.exe_info()): + raise ValueError( + "Method 'laplace_sample' not available for CmdStan versions " + "before 2.32" + ) if mode is None: optimize_args = { "seed": seed, diff --git a/test/test_laplace.py b/test/test_laplace.py new file mode 100644 index 00000000..e69de29b From 81e1828852f49ef6f582b25a9eafd2dd9955ec0d Mon Sep 17 00:00:00 2001 From: Brian Ward Date: Mon, 10 Jul 2023 12:21:58 -0400 Subject: [PATCH 09/12] Start tests --- test/test_laplace.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/test/test_laplace.py b/test/test_laplace.py index e69de29b..d9e1d902 100644 --- a/test/test_laplace.py +++ b/test/test_laplace.py @@ -0,0 +1,30 @@ +"""Tests for the Laplace sampling method.""" + +import os + +import cmdstanpy + +HERE = os.path.dirname(os.path.abspath(__file__)) +DATAFILES_PATH = os.path.join(HERE, 'data') + + +def test_laplace_from_csv(): + model_file = os.path.join(DATAFILES_PATH, 'optimize', 'rosenbrock.stan') + model = cmdstanpy.CmdStanModel(stan_file=model_file) + fit = model.laplace_sample( + data={}, + mode=os.path.join(DATAFILES_PATH, 'optimize', 'rosenbrock_mle.csv'), + ) + assert 'x' in fit.stan_variables() + assert 'y' in fit.stan_variables() + assert isinstance(fit.mode, cmdstanpy.CmdStanMLE) + + +def test_laplace_runs_opt(): + model_file = os.path.join(DATAFILES_PATH, 'optimize', 'rosenbrock.stan') + model = cmdstanpy.CmdStanModel(stan_file=model_file) + fit1 = model.laplace_sample(data={}, seed=1234) + assert isinstance(fit1.mode, cmdstanpy.CmdStanMLE) + + assert fit1.mode.metadata.cmdstan_config['seed'] == 1234 + assert fit1._metadata.cmdstan_config['seed'] == 1234 From 871c132b45c8fd4097e8a96de482117e36008d85 Mon Sep 17 00:00:00 2001 From: Brian Ward Date: Tue, 11 Jul 2023 14:27:00 -0400 Subject: [PATCH 10/12] Add missing methods --- cmdstanpy/stanfit/laplace.py | 144 +++++++++++++++++++++++++++++++++-- 1 file changed, 138 insertions(+), 6 deletions(-) diff --git a/cmdstanpy/stanfit/laplace.py b/cmdstanpy/stanfit/laplace.py index 08ad2070..64d018df 100644 --- a/cmdstanpy/stanfit/laplace.py +++ b/cmdstanpy/stanfit/laplace.py @@ -2,12 +2,29 @@ Container for the result of running a laplace approximation. """ -from typing import Dict, Optional +from typing import ( + Any, + Dict, + Hashable, + List, + MutableMapping, + Optional, + Tuple, + Union, +) import numpy as np +import pandas as pd + +try: + import xarray as xr + + XARRAY_INSTALLED = True +except ImportError: + XARRAY_INSTALLED = False from cmdstanpy.cmdstan_args import Method -from cmdstanpy.utils.data_munging import extract_reshape +from cmdstanpy.utils.data_munging import build_xarray_data, extract_reshape from cmdstanpy.utils.stancsv import scan_laplace_csv from .metadata import InferenceMetadata @@ -15,7 +32,6 @@ from .runset import RunSet # TODO list: -# - remaining methods # - tests # - docs and example notebook # - make sure features like standalone GQ are updated/working @@ -117,9 +133,99 @@ def method_variables(self) -> Dict[str, np.ndarray]: result[name] = self._draws[..., idxs[0]] return result - # def draws - # def draws_pd - # def draws_xr + def draws(self) -> np.ndarray: + """ + Return a numpy.ndarray containing the draws from the + approximate posterior distribution. This is a 2-D array + of shape (draws, parameters). + """ + self._assemble_draws() + return self._draws + + def draws_pd( + self, + vars: Union[List[str], str, None] = None, + ) -> pd.DataFrame: + if vars is not None: + if isinstance(vars, str): + vars_list = [vars] + else: + vars_list = vars + + self._assemble_draws() + cols = [] + if vars is not None: + for var in dict.fromkeys(vars_list): + if ( + var not in self._metadata.method_vars_cols + and var not in self._metadata.stan_vars_cols + ): + raise ValueError('Unknown variable: {}'.format(var)) + if var in self._metadata.method_vars_cols: + cols.append(var) + else: + for idx in self._metadata.stan_vars_cols[var]: + cols.append(self.column_names[idx]) + else: + cols = list(self.column_names) + + return pd.DataFrame(self._draws, columns=self.column_names)[cols] + + def draws_xr( + self, + vars: Union[str, List[str], None] = None, + ) -> "xr.Dataset": + """ + Returns the sampler draws as a xarray Dataset. + + :param vars: optional list of variable names. + + See Also + -------- + CmdStanMCMC.draws_xr + CmdStanGQ.draws_xr + """ + if not XARRAY_INSTALLED: + raise RuntimeError( + 'Package "xarray" is not installed, cannot produce draws array.' + ) + + if vars is None: + vars_list = list(self._metadata.stan_vars_cols.keys()) + elif isinstance(vars, str): + vars_list = [vars] + else: + vars_list = vars + + self._assemble_draws() + + meta = self._metadata.cmdstan_config + attrs: MutableMapping[Hashable, Any] = { + "stan_version": f"{meta['stan_version_major']}." + f"{meta['stan_version_minor']}.{meta['stan_version_patch']}", + "model": meta["model"], + } + + data: MutableMapping[Hashable, Any] = {} + coordinates: MutableMapping[Hashable, Any] = { + "draw": np.arange(self._draws.shape[0]), + } + + for var in vars_list: + build_xarray_data( + data, + var, + self._metadata.stan_vars_dims[var], + self._metadata.stan_vars_cols[var], + 0, + self._draws[:, np.newaxis, :], + self._metadata.stan_vars_types[var], + ) + return ( + xr.Dataset(data, coords=coordinates, attrs=attrs) + .transpose('draw', ...) + .squeeze() + ) @property def mode(self) -> CmdStanMLE: @@ -129,6 +235,32 @@ def mode(self) -> CmdStanMLE: """ return self._mode + def __repr__(self) -> str: + mode = '\n'.join( + ['\t' + line for line in repr(self.mode).splitlines()] + )[1:] + rep = 'CmdStanLaplace: model={} \nmode=({})\n{}'.format( + self._runset.model, + mode, + self._runset._args.method_args.compose(0, cmd=[]), + ) + rep = '{}\n csv_files:\n\t{}\n output_files:\n\t{}'.format( + rep, + '\n\t'.join(self._runset.csv_files), + '\n\t'.join(self._runset.stdout_files), + ) + return rep + + @property + def column_names(self) -> Tuple[str, ...]: + """ + Names of all outputs from the sampler, comprising sampler parameters + and all components of all model parameters, transformed parameters, + and quantities of interest. Corresponds to Stan CSV file header row, + with names munged to array notation, e.g. `beta[1]` not `beta.1`. + """ + return self._metadata.cmdstan_config['column_names'] # type: ignore + def save_csvfiles(self, dir: Optional[str] = None) -> None: """ Move output CSV files to specified directory. If files were From 2a382aaf0053a8c635a671ed6dbcd932f8ddd99e Mon Sep 17 00:00:00 2001 From: Brian Ward Date: Tue, 25 Jul 2023 12:22:30 -0400 Subject: [PATCH 11/12] Basic tests --- cmdstanpy/model.py | 20 ++++++++++++++------ cmdstanpy/stanfit/laplace.py | 18 ++++++++++++++++++ test/test_laplace.py | 36 +++++++++++++++++++++++++++++++++++- 3 files changed, 67 insertions(+), 7 deletions(-) diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py index f0e5e220..9073104d 100644 --- a/cmdstanpy/model.py +++ b/cmdstanpy/model.py @@ -1709,7 +1709,6 @@ def laplace_sample( timeout: Optional[float] = None, opt_args: Optional[Dict[str, Any]] = None, ) -> CmdStanLaplace: - if cmdstan_version_before(2, 32, self.exe_info()): raise ValueError( "Method 'laplace_sample' not available for CmdStan versions " @@ -1741,14 +1740,23 @@ def laplace_sample( ) from e elif not isinstance(mode, CmdStanMLE): cmdstan_mode = from_csv(mode) # type: ignore # we check below - if cmdstan_mode.runset.method != Method.OPTIMIZE: - raise ValueError( - "Mode must be a CmdStanMLE or a path to an optimize CSV" - ) else: cmdstan_mode = mode - # TODO: jacobian warnings on mismatch + if cmdstan_mode.runset.method != Method.OPTIMIZE: + raise ValueError( + "Mode must be a CmdStanMLE or a path to an optimize CSV" + ) + + mode_jacobian = ( + cmdstan_mode.runset._args.method_args.jacobian # type: ignore + ) + if mode_jacobian != jacobian: + raise ValueError( + "Jacobian argument to optimize and laplace must match!\n" + f"Laplace was run with jacobian={jacobian},\n" + f"but optimize was run with jacobian={mode_jacobian}" + ) laplace_args = LaplaceArgs( cmdstan_mode.runset.csv_files[0], draws, jacobian diff --git a/cmdstanpy/stanfit/laplace.py b/cmdstanpy/stanfit/laplace.py index 64d018df..f3da2af5 100644 --- a/cmdstanpy/stanfit/laplace.py +++ b/cmdstanpy/stanfit/laplace.py @@ -251,6 +251,24 @@ def __repr__(self) -> str: ) return rep + def __getattr__(self, attr: str) -> np.ndarray: + """Synonymous with ``fit.stan_variable(attr)""" + if attr.startswith("_"): + raise AttributeError(f"Unknown variable name {attr}") + try: + return self.stan_variable(attr) + except ValueError as e: + # pylint: disable=raise-missing-from + raise AttributeError(*e.args) + + def __getstate__(self) -> dict: + # This function returns the mapping of objects to serialize with pickle. + # See https://docs.python.org/3/library/pickle.html#object.__getstate__ + # for details. We call _assemble_draws to ensure posterior samples have + # been loaded prior to serialization. + self._assemble_draws() + return self.__dict__ + @property def column_names(self) -> Tuple[str, ...]: """ diff --git a/test/test_laplace.py b/test/test_laplace.py index d9e1d902..eca0ce34 100644 --- a/test/test_laplace.py +++ b/test/test_laplace.py @@ -2,6 +2,9 @@ import os +import numpy as np +import pytest + import cmdstanpy HERE = os.path.dirname(os.path.abspath(__file__)) @@ -14,6 +17,7 @@ def test_laplace_from_csv(): fit = model.laplace_sample( data={}, mode=os.path.join(DATAFILES_PATH, 'optimize', 'rosenbrock_mle.csv'), + jacobian=False, ) assert 'x' in fit.stan_variables() assert 'y' in fit.stan_variables() @@ -23,8 +27,38 @@ def test_laplace_from_csv(): def test_laplace_runs_opt(): model_file = os.path.join(DATAFILES_PATH, 'optimize', 'rosenbrock.stan') model = cmdstanpy.CmdStanModel(stan_file=model_file) - fit1 = model.laplace_sample(data={}, seed=1234) + fit1 = model.laplace_sample(data={}, seed=1234, opt_args={'iter': 1003}) assert isinstance(fit1.mode, cmdstanpy.CmdStanMLE) assert fit1.mode.metadata.cmdstan_config['seed'] == 1234 assert fit1._metadata.cmdstan_config['seed'] == 1234 + assert fit1.mode.metadata.cmdstan_config['iter'] == 1003 + + +def test_laplace_bad_jacobian_mismatch(): + model_file = os.path.join(DATAFILES_PATH, 'optimize', 'rosenbrock.stan') + model = cmdstanpy.CmdStanModel(stan_file=model_file) + with pytest.raises(ValueError): + model.laplace_sample( + data={}, + mode=os.path.join(DATAFILES_PATH, 'optimize', 'rosenbrock_mle.csv'), + jacobian=True, + ) + + +def test_laplace_outputs(): + model_file = os.path.join(DATAFILES_PATH, 'optimize', 'rosenbrock.stan') + model = cmdstanpy.CmdStanModel(stan_file=model_file) + fit = model.laplace_sample(data={}, seed=1234, draws=123) + + variables = fit.stan_variables() + assert 'x' in variables + assert 'y' in variables + assert variables['x'].shape == (123,) + + np.testing.assert_array_equal(variables['x'], fit.x) + + fit_pd = fit.draws_pd() + assert 'x' in fit_pd.columns + assert 'y' in fit_pd.columns + assert fit_pd['x'].shape == (123,) From 3514acf05cb61319f88703f79dc46b6619e9871a Mon Sep 17 00:00:00 2001 From: Brian Ward Date: Tue, 25 Jul 2023 12:28:08 -0400 Subject: [PATCH 12/12] More input validation --- cmdstanpy/model.py | 4 ++++ cmdstanpy/stanfit/laplace.py | 2 -- test/test_laplace.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py index 9073104d..7fd9d33f 100644 --- a/cmdstanpy/model.py +++ b/cmdstanpy/model.py @@ -1714,6 +1714,10 @@ def laplace_sample( "Method 'laplace_sample' not available for CmdStan versions " "before 2.32" ) + if opt_args is not None and mode is not None: + raise ValueError( + "Cannot specify both 'opt_args' and 'mode' arguments" + ) if mode is None: optimize_args = { "seed": seed, diff --git a/cmdstanpy/stanfit/laplace.py b/cmdstanpy/stanfit/laplace.py index f3da2af5..caa535e7 100644 --- a/cmdstanpy/stanfit/laplace.py +++ b/cmdstanpy/stanfit/laplace.py @@ -32,7 +32,6 @@ from .runset import RunSet # TODO list: -# - tests # - docs and example notebook # - make sure features like standalone GQ are updated/working @@ -57,7 +56,6 @@ def _assemble_draws(self) -> None: if self._draws.shape != (0,): return - # TODO: should we fake a chain dimension? with open(self._runset.csv_files[0], 'r') as fd: lines = (line for line in fd if not line.startswith('#')) self._draws = np.loadtxt( diff --git a/test/test_laplace.py b/test/test_laplace.py index eca0ce34..c7e46e55 100644 --- a/test/test_laplace.py +++ b/test/test_laplace.py @@ -46,6 +46,18 @@ def test_laplace_bad_jacobian_mismatch(): ) +def test_laplace_bad_two_modes(): + model_file = os.path.join(DATAFILES_PATH, 'optimize', 'rosenbrock.stan') + model = cmdstanpy.CmdStanModel(stan_file=model_file) + with pytest.raises(ValueError): + model.laplace_sample( + data={}, + mode=os.path.join(DATAFILES_PATH, 'optimize', 'rosenbrock_mle.csv'), + opt_args={'iter': 1003}, + jacobian=False, + ) + + def test_laplace_outputs(): model_file = os.path.join(DATAFILES_PATH, 'optimize', 'rosenbrock.stan') model = cmdstanpy.CmdStanModel(stan_file=model_file)