From 674ee9dedf4b40108b4b049ae42110fd32789546 Mon Sep 17 00:00:00 2001 From: merliseclyde Date: Tue, 1 May 2018 17:05:13 -0400 Subject: [PATCH] fix issue #17 --- DESCRIPTION | 2 +- NEWS.md | 2 + R/bas.R | 1 + inst/doc/BAS-vignette.R | 161 -------- inst/doc/BAS-vignette.Rmd | 390 ------------------ inst/doc/BAS-vignette.html | 669 ------------------------------ src/bayesreg.c | 21 +- vignettes/BAS-vignette.R | 25 +- vignettes/BAS-vignette.Rmd | 1 + vignettes/BAS-vignette.html | 788 +++++++++++++++++++++++------------- 10 files changed, 530 insertions(+), 1530 deletions(-) delete mode 100644 inst/doc/BAS-vignette.R delete mode 100644 inst/doc/BAS-vignette.Rmd delete mode 100644 inst/doc/BAS-vignette.html diff --git a/DESCRIPTION b/DESCRIPTION index f9cac774..f5891c6c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BAS Version: 1.5.0 -Date: 2018-04-25 +Date: 2018-05-01 Title: Bayesian Variable Selection and Model Averaging using Bayesian Adaptive Sampling Authors@R: c(person("Merlise", "Clyde", email="clyde@duke.edu", role=c("aut","cre", "cph"), diff --git a/NEWS.md b/NEWS.md index 2fbb8722..aa7784c9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,8 @@ * fixed problem if there is only one model for `image` function; github [issue #11](https://github.com/merliseclyde/BAS/issues/11) +* fixed error in `bas.lm` with non-equal weights where R2 was incorrect. 
+ [issue #17](https://github.com/merliseclyde/BAS/issues/17) ## Deprecated * deprecate the `predict` argument in `predict.bas`, `predict.basglm` and internal functions as it is not utilized diff --git a/R/bas.R b/R/bas.R index a3e1f5b6..8e91a542 100644 --- a/R/bas.R +++ b/R/bas.R @@ -374,6 +374,7 @@ #' \dontrun{demo(BAS.USCrime) } #' #' @rdname bas.lm +#' @keywords regression #' @family BAS methods #' @concept BMA #' @concept variable selection diff --git a/inst/doc/BAS-vignette.R b/inst/doc/BAS-vignette.R deleted file mode 100644 index 4cb53b7d..00000000 --- a/inst/doc/BAS-vignette.R +++ /dev/null @@ -1,161 +0,0 @@ -## ----setup, include=FALSE------------------------------------------------ -#require(knitr) -require(MASS) - -## ----data---------------------------------------------------------------- -library(MASS) -data(UScrime) - -## ----transform----------------------------------------------------------- -UScrime[,-2] = log(UScrime[,-2]) - -## ----bas----------------------------------------------------------------- -library(BAS) -crime.ZS = bas.lm(y ~ ., - data=UScrime, - prior="ZS-null", - modelprior=uniform(), initprobs="eplogp") - -## ---- fig.show='hold'---------------------------------------------------- -plot(crime.ZS, ask=F) - - -## ----pip, fig.width=5, fig.height=5-------------------------------------- -plot(crime.ZS, which = 4, ask=FALSE, caption="", sub.caption="") - -## ----print--------------------------------------------------------------- -crime.ZS - -## ----summary------------------------------------------------------------------ -options(width = 80) -summary(crime.ZS) - -## ----image, fig.width=5, fig.height=5----------------------------------------- -image(crime.ZS, rotate=F) - -## ----coef--------------------------------------------------------------------- -coef.ZS = coef(crime.ZS) - - -## ----plot--------------------------------------------------------------------- -plot(coef.ZS, subset=c(5:6), ask=F) - -## 
----coefall------------------------------------------------------------------ - -plot(coef.ZS, ask=FALSE) - -## ----confint-coef------------------------------------------------------------- - -confint(coef.ZS) - -## ----plot-confint, fig.width=7------------------------------------------------ -plot(confint(coef.ZS, parm=2:16)) - -## ---- warning=FALSE, fig.width=7--------------------------------------------- -plot(confint(coef(crime.ZS, estimator="HPM"))) - -## ---- warning=FALSE, fig.width=7--------------------------------------------- -plot(confint(coef(crime.ZS, estimator="MPM"))) - -## ----choice of estimator------------------------------------------------------ -muhat.BMA = fitted(crime.ZS, estimator="BMA") -BMA = predict(crime.ZS, estimator="BMA") - -# predict has additional slots for fitted values under BMA, predictions under each model -names(BMA) - -## ---- fig.width=5, fig.height=5----------------------------------------------- -par(mar=c(9, 9, 3, 3)) -plot(muhat.BMA, BMA$fit, - pch=16, - xlab=expression(hat(mu[i])), ylab=expression(hat(Y[i]))) -abline(0,1) - -## ----HPM---------------------------------------------------------------------- -HPM = predict(crime.ZS, estimator="HPM") - -# show the indices of variables in the best model where 0 is the intercept -HPM$bestmodel - -## ----------------------------------------------------------------------------- -(crime.ZS$namesx[HPM$bestmodel +1])[-1] - -## ----MPM---------------------------------------------------------------------- -MPM = predict(crime.ZS, estimator="MPM") -(crime.ZS$namesx[attr(MPM$fit, 'model') +1])[-1] - -## ----BPM---------------------------------------------------------------------- -BPM = predict(crime.ZS, estimator="BPM") -(crime.ZS$namesx[attr(BPM$fit, 'model') +1])[-1] - -## ---- fig.width=6, fig.height=6----------------------------------------------- -GGally::ggpairs(data.frame(HPM = as.vector(HPM$fit), #this used predict so we need to extract fitted values - MPM = 
as.vector(MPM$fit), # this used fitted - BPM = as.vector(BPM$fit), # this used fitted - BMA = as.vector(BMA$fit))) # this used predict - -## ----se, fig.width=7---------------------------------------------------------- -BPM = predict(crime.ZS, estimator="BPM", se.fit=TRUE) -crime.conf.fit = confint(BPM, parm="mean") -crime.conf.pred = confint(BPM, parm="pred") -plot(crime.conf.fit) -plot(crime.conf.pred) - -## ----pred--------------------------------------------------------------------- - -new.pred = predict(crime.ZS, newdata=UScrime, estimator="MPM") - -## ----------------------------------------------------------------------------- -system.time( - for (i in 1:10) { - crime.ZS <- bas.lm(y ~ ., - data=UScrime, - prior="ZS-null", method="BAS", - modelprior=uniform(), initprobs="eplogp") - } -) - -system.time( - for (i in 1:10) { - crime.ZS <- bas.lm(y ~ ., - data=UScrime, - prior="ZS-null", method="deterministic", - modelprior=uniform(), initprobs="eplogp") - } -) - - -## ----MCMC--------------------------------------------------------------------- -crime.ZS = bas.lm(y ~ ., - data=UScrime, - prior="ZS-null", - modelprior=uniform(), - method = "MCMC") - -## ----diagnostics-------------------------------------------------------------- -diagnostics(crime.ZS, type="pip", pch=16) -diagnostics(crime.ZS, type="model", pch=16) - -## ----biggerMCMC, eval=FALSE--------------------------------------------------- -# crime.ZS = bas.lm(y ~ ., -# data=UScrime, -# prior="ZS-null", -# modelprior=uniform(), -# method = "MCMC", MCMC.iterations = 10^6) -# -# # Don't run diagnostics(crime.ZS, type="model", pch=16) - -## ----add-out------------------------------------------------------------------ -data("stackloss") -stackloss$out.ind = diag(nrow(stackloss)) - -stack.bas = bas.lm(stack.loss ~ ., data=stackloss, - method="MCMC", initprobs="marg-eplogp", - prior="ZS-null", - modelprior=tr.poisson(4,10), - MCMC.iterations=200000 - ) - -## 
----------------------------------------------------------------------------- -knitr::kable(as.data.frame(summary(stack.bas))) - diff --git a/inst/doc/BAS-vignette.Rmd b/inst/doc/BAS-vignette.Rmd deleted file mode 100644 index 189050fe..00000000 --- a/inst/doc/BAS-vignette.Rmd +++ /dev/null @@ -1,390 +0,0 @@ ---- -title: "Using the Bayesian Adaptive Sampling (BAS) Package for Bayesian Model Averaging and Variable Selection" -author: "Merlise A Clyde" -date: "`r Sys.Date()`" -r_packages: - - rmarkdown -output: rmarkdown::html_vignette -vignette: > - %\VignetteIndexEntry{Using the Bayesian Adaptive Sampling (BAS) Package for Bayesian Model Averaging and Variable Selection} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- -```{r setup, include=FALSE} -#require(knitr) -require(MASS) -``` - -The `BAS` package provides easy to use functions to implement Bayesian Model Averaging in linear models and generalized linear models. -Prior distributions on coefficients are -based on Zellner's g-prior or mixtures of g-priors, such as the -Zellner-Siow Cauchy prior or mixtures of g-priors from -[Liang et al (2008)](http://dx.doi.org/10.1198/016214507000001337) -for linear models, as well as other options including AIC, BIC, RIC and Empirical Bayes methods. Extensions to Generalized Linear Models are based on the -mixtures of g-priors in GLMs of -[Li and Clyde (2015)](http://arxiv.org/abs/1503.06913) using an integrated -Laplace approximation. - -`BAS` uses an adaptive sampling algorithm to sample without replacement from the space of models or MCMC sampling which is recommended for sampling problems with a large number of predictors. See [Clyde, Littman & Ghosh](http://dx.doi.org/10.1198/jcgs.2010.09049) for more details for the sampling algorithms. 
- -## Installing BAS - -The stable version can be installed easily in the `R` console like any other package: - -```r -install.packages('BAS') -``` - -On the other hand, I welcome everyone to use the most recent version -of the package with quick-fixes, new features and probably new -bugs. To get the latest -development version from [GitHub](https://github.com/merliseclyde), -use the `devtools` package from -[CRAN](https://cran.r-project.org/package=devtools) and enter in `R`: - -```r -# devtools::install_github('merliseclyde/BAS') -``` - -As the package does depend on BLAS and LAPACK, installing from GitHub will require that you have FORTRAN and C compilers on your system. - -## Demo - -We will use the UScrime data to illustrate some of the commands and functionality. - -```{r data} -library(MASS) -data(UScrime) -``` - -Following other analyses, we will go ahead and log transform all of the variables except column 2, which is the indicator variable of the state being a southern state. - -```{r transform} -UScrime[,-2] = log(UScrime[,-2]) -``` - -To get started, we will use `BAS` with the Zellner-Siow Cauchy prior on the coefficients. - -```{r bas} -library(BAS) -crime.ZS = bas.lm(y ~ ., - data=UScrime, - prior="ZS-null", - modelprior=uniform(), initprobs="eplogp") -``` - -`BAS` uses a model formula similar to `lm` to specify the full model with all of the potential predictors. Here we are using the shorthand `.` to indicate that all remaining variables in the data frame will be included. `BAS` require a data frame as the input for the `data` argument. Different prior distributions on the regression coefficients may be specified using the `prior` argument, and include - - * "BIC" - * "AIC - * "g-prior" - * "hyper-g" - * "hyper-g-laplace" - * "hyper-g-n" - * "ZS-null" - * "ZS-full" - * "EB-local" - * "EB-global" - -where the default is the Zellner-Siow prior, ZS-null, where all Bayes factors are compared to the null model. 
- -By default, `BAS` will try to enumerate all models, in this case $2^{15}$ using the default `method="BAS"`. The prior distribution over the models is a `uniform()` distribution which assigns equal probabilities to all models. The last optional argument `initprobs = eplogp` provides a way to initialize the sampling algorithm and order the variables in the tree structure that represents the model space in BAS. The `eplogp` option uses the Bayes factor calibration of p-values $-e p \log(p)$ to provide an approximation to the marginal inclusion probability that the coefficient of each predictor is zero, using the p-values from the full model. Other options for `initprobs` include - - * "marg-eplogp"" - * "uniform" - * numeric vector of length p - -The option "marg-eplogp" uses p-values from the p simple linear regressions (useful for large p or highly correlated variables). -The numeric provides a way to force variables to always be included if the initialprob is 1. - -Since we are enumerating under all possible models these options are not important. - -## Plots - -Some graphical summaries of the output may be obtained by the `plot` function - -```{r, fig.show='hold'} -plot(crime.ZS, ask=F) - -``` - -which produces a panel of four plots. The first is a plot of residuals and fitted values under Bayesian Model Averaging. Ideally, of our model assumptions hold, we will not see outliers or non-constant variance. The second plot shows the cumulative probability of the models in the order that they are sampled. This plot indicates that the cumulative probability is leveling off as each additional model adds only a small increment to the cumulative probability, which earlier, there are larger jumps corresponding to sampling high probability models. The third plot shows the dimension of each model (the number of regression coefficients including the intercept) versus the log of the marginal likelihood of the model. 
The last plot shows the marginal posterior inclusion probabilities (pip) for each of the covariates, with marginal pips greater than 0.5 shown in red. The variables with pip > 0.5 correspond to what is known as the median probability model. Variables with high inclusion probabilities are generally important for explaining the data or prediction, but marginal inclusion probabilities may be small if there predictors are correlated, similar to how p-values may be large in the presence of mullticollinearity. - -Individual plots may be obtained using the `which` option. -```{r pip, fig.width=5, fig.height=5} -plot(crime.ZS, which = 4, ask=FALSE, caption="", sub.caption="") -``` - -`BAS` has `print` and `summary` methods defined for objects of class `bas`. Typing the objects name -```{r print} -crime.ZS -``` - -returns a summary of the marginal inclusion probabilities, while the `summary` function -provides -```{r summary} -options(width = 80) -summary(crime.ZS) -``` -This lists the top 5 models (in terms of posterior probability) with the zero-one indicators for variable inclusion. The other columns in the summary are the Bayes factor of each model to the highest probability model (hence its Bayes factor is 1), the posterior probabilities of the models, the ordinary $R^2$ of the models, the dimension of the models (number of coefficients including the intercept) and the log marginal likelihood under the selected prior distribution. - -## Visualization of the Model Space - -To see beyond the first five models, we can represent the collection of the models via an `image` plot. By default this shows the top 20 models. - -```{r image, fig.width=5, fig.height=5} -image(crime.ZS, rotate=F) -``` - -This image has rows that correspond to each of the variables and intercept, with labels for the variables on the y-axis. The x-axis corresponds to the possible models. 
These are sorted by their posterior probability from best at the left -to worst at the right with the rank on the top x-axis. - -Each column represents one of the 16 models. The variables -that are excluded in a model are shown in black for each column, while -the variables that are included are colored, with the color related to -the log posterior probability. -The color of each column is proportional to the log of the posterior probabilities (the lower x-axis) of that model. Models that are the same color have similar log posterior probabilities which allows us to view models that are clustered together that have marginal likelihoods where the differences are not "worth a bare mention". - -This plot indicates that the police expenditure in the two years do not enter the model together, and is an indication of the high correlation between the two variables. - -## Posterior Distributions of Coefficients - -To examine the marginal distributions of the two coefficients for the police expenditures, we can extract the coefficients estimates and standard deviations under BMA. - -```{r coef} -coef.ZS = coef(crime.ZS) - -``` -an optional argument, `n.models` to `coef` will use only the top `n.models` for BMA and may be more computationally efficient for large problems. - -Plots the posterior distributions averaging over all of the models are obtained using the `plot` method. - -```{r plot} -plot(coef.ZS, subset=c(5:6), ask=F) -``` - -The vertical bar represents the posterior probability that the coefficient is 0 while -the bell shaped curve represents the density of plausible values from all the models where the coefficient is non-zero. This is scaled so that the height of the density for non-zero values is the probability that the coefficient is non-zero. 
- -Omitting the `subset` argument provides all of the marginal distributions - -```{r coefall} - -plot(coef.ZS, ask=FALSE) -``` - - -To obtain credible intervals for coefficients, `BAS` includes a `confint` method to create Highest Posterior Density intervals from the summaries from `coef`. -```{r confint-coef} - -confint(coef.ZS) -``` -where the third column is the posterior mean. -This uses Monte Carlo sampling to draw from the mixture model over coefficient where models are sampled based on their posterior probabilities. - -We can also plot these via -```{r plot-confint, fig.width=7} -plot(confint(coef.ZS, parm=2:16)) -``` -using the `parm` argument to select which coefficients to plot (the intercept is `parm=1`). - -For estimation under selection, `BAS` supports additional arguments -via `estimator`. The default is `estimator="BMA"` which uses all models or `n.models`. Other options include estimation under the highest probability model - -```{r, warning=FALSE, fig.width=7} -plot(confint(coef(crime.ZS, estimator="HPM"))) -``` - -or the median probability model - -```{r, warning=FALSE, fig.width=7} -plot(confint(coef(crime.ZS, estimator="MPM"))) -``` - -where variables that are excluded have distributions that are point masses at zero under selection. - -## Prediction - -`BAS` has methods defined to return fitted values, `fitted`, using the observed design matrix and predictions at either the observed data or potentially new values, `predict`, as with `lm`. - -```{r choice of estimator} -muhat.BMA = fitted(crime.ZS, estimator="BMA") -BMA = predict(crime.ZS, estimator="BMA") - -# predict has additional slots for fitted values under BMA, predictions under each model -names(BMA) -``` - -Plotting the two sets of fitted values, -```{r, fig.width=5, fig.height=5} -par(mar=c(9, 9, 3, 3)) -plot(muhat.BMA, BMA$fit, - pch=16, - xlab=expression(hat(mu[i])), ylab=expression(hat(Y[i]))) -abline(0,1) -``` - -we see that they are in perfect agreement. 
That is always the case as the posterior mean for the regression mean function at a point $x$ is the expected posterior predictive value for $Y$ at $x$. This is true not only for estimators such as BMA, but the expected values under model selection. - -### Inference with model selection ### - -In addition to using BMA, we can use the posterior means under model selection. This corresponds to a decision rule that combines estimation and selection. `BAS` currently implements the following options - - -**highest probability model:** - -```{r HPM} -HPM = predict(crime.ZS, estimator="HPM") - -# show the indices of variables in the best model where 0 is the intercept -HPM$bestmodel -``` - -A little more interpretable version with names: -```{r} -(crime.ZS$namesx[HPM$bestmodel +1])[-1] -``` - -**median probability model:** -```{r MPM} -MPM = predict(crime.ZS, estimator="MPM") -(crime.ZS$namesx[attr(MPM$fit, 'model') +1])[-1] -``` - -This is the model where all predictors have an inclusion probability greater than or equal to 0.5. This coincides with the HPM if the predictors are all mutually orthogonal, and in this case is the best predictive model under squared error loss. - -Note that we can also extract the best model from the attribute in the fitted values as well. - -**best predictive model:** - -In general, the HPM or MPM are not the best predictive models, which from a Bayesian decision theory perspective would be the model that is closest to BMA predictions under squared error loss. 
-```{r BPM} -BPM = predict(crime.ZS, estimator="BPM") -(crime.ZS$namesx[attr(BPM$fit, 'model') +1])[-1] -``` - -Let's see how they compare: - -```{r, fig.width=6, fig.height=6} -GGally::ggpairs(data.frame(HPM = as.vector(HPM$fit), #this used predict so we need to extract fitted values - MPM = as.vector(MPM$fit), # this used fitted - BPM = as.vector(BPM$fit), # this used fitted - BMA = as.vector(BMA$fit))) # this used predict -``` - - -Using the `se.fit = TRUE` option with `predict` we can also calculate standard deviations for prediction or for the mean and use this as input for the `confint` function for the prediction object. - -```{r se, fig.width=7} -BPM = predict(crime.ZS, estimator="BPM", se.fit=TRUE) -crime.conf.fit = confint(BPM, parm="mean") -crime.conf.pred = confint(BPM, parm="pred") -plot(crime.conf.fit) -plot(crime.conf.pred) -``` - -For prediction at new points, we can supply a new dataframe to the predict function as in `lm`. -```{r pred} - -new.pred = predict(crime.ZS, newdata=UScrime, estimator="MPM") -``` - -## Alternative algorithms - -`BAS` has several options for sampling from the model space with or without enumeration. The (current) default `method="BAS"` samples models without replacement using estimates of the marginal inclusion probabilities using the algorithm described in [http://dx.doi.org/10.1198/jcgs.2010.09049](Clyde et al 2011). The initial sampling probabilities provided by `initprobs` are updated based on the sampled models, every `update` iterations. -This can be more efficient in some cases if a large fraction of the model space has been sampled, however, in cases of high correlation and a large number of predictors, this can lead to biased estimates [http://dx.doi.org/10.1093/biomet/ass040](Clyde and Ghosh 20112), in which case MCMC is preferred. The `method="MCMC"` is described below and is better for large $p$. 
- -A deterministic sampling scheme is also available for enumeration; - -```{r} -system.time( - for (i in 1:10) { - crime.ZS <- bas.lm(y ~ ., - data=UScrime, - prior="ZS-null", method="BAS", - modelprior=uniform(), initprobs="eplogp") - } -) - -system.time( - for (i in 1:10) { - crime.ZS <- bas.lm(y ~ ., - data=UScrime, - prior="ZS-null", method="deterministic", - modelprior=uniform(), initprobs="eplogp") - } -) - -``` -which is faster for enumeration than the default method="BAS". - -## Beyond Enumeration - -Many problems are too large to enumerate all possible models. In such cases we may use the `method="BAS"` to sample without replacement or the `method="MCMC"` option to sample models using Markov Chain Monte Carlo sampling to sample models based on their posterior probabilities. For spaces where the number of models greatly exceeds the number of models to sample, the MCMC option tends to provide estimates with low bias compared to the sampling without replacement. - -```{r MCMC} -crime.ZS = bas.lm(y ~ ., - data=UScrime, - prior="ZS-null", - modelprior=uniform(), - method = "MCMC") -``` - -This will run the MCMC sampler until the number of unique sampled models exceeds `n.models` which is $2^p$ (if $p < 19$) by default or until `MCMC.iterations` has been exceeded, where `MCMC.iterations = n.models*2` by default. - -### Estimates of Marginal Posterior Inclusion Probabilities (pip) ### - -With MCMC sampling there are two estimates of the marginal inclusion probabilities: `object$probne0` which are obtained by using the re-normalized posterior odds from sampled models to estimate probabilities and the estimates based on Monte Carlo frequencies `object$probs.MCMC`. These should be in close agreement if the MCMC sampler has run for enough iterations. 
- -`BAS` includes a diagnostic function to compare the two sets of estimates of posterior inclusion probabilities and posterior model probabilities -```{r diagnostics} -diagnostics(crime.ZS, type="pip", pch=16) -diagnostics(crime.ZS, type="model", pch=16) -``` - -In the left hand plot of pips, each point represents one posterior inclusion probability for the 15 variables estimated under the two methods. The two estimators are in pretty close agreement. The plot of the model probabilities suggests that we should use more `MCMC.iterations` if we want more accurate estimates of the posterior model probabilities. - -```{r biggerMCMC, eval=FALSE} -crime.ZS = bas.lm(y ~ ., - data=UScrime, - prior="ZS-null", - modelprior=uniform(), - method = "MCMC", MCMC.iterations = 10^6) - -# Don't run diagnostics(crime.ZS, type="model", pch=16) -``` - - -## Outliers - -BAS can also be used for exploring mean shift or variance inflation outliers by adding indicator variables for each case being an outlier (the mean is not given by the regression) or not. This is similar to the MC3.REG function in BMA, although here we are using a g-prior or mixture of g-priors for the coefficients for the outlier means. - -Using the Stackloss data, we can add an identify matrix to the original dataframe, where each column is an indicator that the ith variable is an outlier. - -```{r add-out} -data("stackloss") -stackloss$out.ind = diag(nrow(stackloss)) - -stack.bas = bas.lm(stack.loss ~ ., data=stackloss, - method="MCMC", initprobs="marg-eplogp", - prior="ZS-null", - modelprior=tr.poisson(4,10), - MCMC.iterations=200000 - ) -``` - -The above call introduces using truncated prior distributions on the model space; in this case the distribution of the number of variables to be included has a Poisson distribution, with mean 4 (under no truncation), and the truncation point is at 10, so that all models with more than 10 (one half of cases rounded down) will have probability zero. 
- -Looking at the summaries -```{r} -knitr::kable(as.data.frame(summary(stack.bas))) -``` - - - -## Summary - -`BAS` includes other prior distributions on coefficients and models, as well as `bas.glm` for fitting Generalized Linear Models. The syntax for bas.glm and bas.lm are not yet the same, particularly for how some of the priors on coefficients are represented, so please see the documentation for more features and details until this is updated! - -For issues or feature requests please submit via the package's github page -[merliseclyde/BAS](http://github.com/merliseclyde/BAS) diff --git a/inst/doc/BAS-vignette.html b/inst/doc/BAS-vignette.html deleted file mode 100644 index 09ed8b5f..00000000 --- a/inst/doc/BAS-vignette.html +++ /dev/null @@ -1,669 +0,0 @@ - - - - - - - - - - - - - - - - -Using the Bayesian Adaptive Sampling (BAS) Package for Bayesian Model Averaging and Variable Selection - - - - - - - - - - - - - - - - - -

Using the Bayesian Adaptive Sampling (BAS) Package for Bayesian Model Averaging and Variable Selection

-

Merlise A Clyde

-

2017-06-23

- - - -

The BAS package provides easy to use functions to implement Bayesian Model Averaging in linear models and generalized linear models. Prior distributions on coefficients are based on Zellner’s g-prior or mixtures of g-priors, such as the Zellner-Siow Cauchy prior or mixtures of g-priors from Liang et al (2008) for linear models, as well as other options including AIC, BIC, RIC and Empirical Bayes methods. Extensions to Generalized Linear Models are based on the mixtures of g-priors in GLMs of Li and Clyde (2015) using an integrated Laplace approximation.

-

BAS uses an adaptive sampling algorithm to sample without replacement from the space of models or MCMC sampling which is recommended for sampling problems with a large number of predictors. See Clyde, Littman & Ghosh for more details for the sampling algorithms.

-
-

Installing BAS

-

The stable version can be installed easily in the R console like any other package:

-
install.packages('BAS')
-

On the other hand, I welcome everyone to use the most recent version of the package with quick-fixes, new features and probably new bugs. To get the latest development version from GitHub, use the devtools package from CRAN and enter in R:

-
# devtools::install_github('merliseclyde/BAS')
-

As the package does depend on BLAS and LAPACK, installing from GitHub will require that you have FORTRAN and C compilers on your system.

-
-
-

Demo

-

We will use the UScrime data to illustrate some of the commands and functionality.

-
library(MASS)
-data(UScrime)
-

Following other analyses, we will go ahead and log transform all of the variables except column 2, which is the indicator variable of the state being a southern state.

-
UScrime[,-2] = log(UScrime[,-2])
-

To get started, we will use BAS with the Zellner-Siow Cauchy prior on the coefficients.

-
library(BAS)
-crime.ZS =  bas.lm(y ~ ., 
-                   data=UScrime,
-                   prior="ZS-null",
-                   modelprior=uniform(), initprobs="eplogp") 
-

BAS uses a model formula similar to lm to specify the full model with all of the potential predictors. Here we are using the shorthand . to indicate that all remaining variables in the data frame will be included. BAS require a data frame as the input for the data argument. Different prior distributions on the regression coefficients may be specified using the prior argument, and include

- -

where the default is the Zellner-Siow prior, ZS-null, where all Bayes factors are compared to the null model.

-

By default, BAS will try to enumerate all models, in this case \(2^{15}\) using the default method="BAS". The prior distribution over the models is a uniform() distribution which assigns equal probabilities to all models. The last optional argument initprobs = eplogp provides a way to initialize the sampling algorithm and order the variables in the tree structure that represents the model space in BAS. The eplogp option uses the Bayes factor calibration of p-values \(-e p \log(p)\) to provide an approximation to the marginal inclusion probability that the coefficient of each predictor is zero, using the p-values from the full model. Other options for initprobs include

- -

The option “marg-eplogp” uses p-values from the p simple linear regressions (useful for large p or highly correlated variables). The numeric provides a way to force variables to always be included if the initialprob is 1.

-

Since we are enumerating under all possible models these options are not important.

-
-
-

Plots

-

Some graphical summaries of the output may be obtained by the plot function

-
plot(crime.ZS, ask=F)
-

-

which produces a panel of four plots. The first is a plot of residuals and fitted values under Bayesian Model Averaging. Ideally, of our model assumptions hold, we will not see outliers or non-constant variance. The second plot shows the cumulative probability of the models in the order that they are sampled. This plot indicates that the cumulative probability is leveling off as each additional model adds only a small increment to the cumulative probability, which earlier, there are larger jumps corresponding to sampling high probability models. The third plot shows the dimension of each model (the number of regression coefficients including the intercept) versus the log of the marginal likelihood of the model. The last plot shows the marginal posterior inclusion probabilities (pip) for each of the covariates, with marginal pips greater than 0.5 shown in red. The variables with pip > 0.5 correspond to what is known as the median probability model. Variables with high inclusion probabilities are generally important for explaining the data or prediction, but marginal inclusion probabilities may be small if there predictors are correlated, similar to how p-values may be large in the presence of mullticollinearity.

-

Individual plots may be obtained using the which option.

-
plot(crime.ZS, which = 4, ask=FALSE, caption="", sub.caption="")
-

-

BAS has print and summary methods defined for objects of class bas. Typing the objects name

-
crime.ZS
-
## 
-## Call:
-## bas.lm(formula = y ~ ., data = UScrime, prior = "ZS-null", modelprior = uniform(),     initprobs = "eplogp")
-## 
-## 
-##  Marginal Posterior Inclusion Probabilities: 
-## Intercept          M         So         Ed        Po1        Po2  
-##    1.0000     0.8498     0.2704     0.9735     0.6643     0.4477  
-##        LF        M.F        Pop         NW         U1         U2  
-##    0.1988     0.2016     0.3653     0.6882     0.2485     0.6089  
-##       GDP       Ineq       Prob       Time  
-##    0.3546     0.9964     0.8955     0.3657
-

returns a summary of the marginal inclusion probabilities, while the summary function provides

-
options(width = 80)
-summary(crime.ZS)
-
##           P(B != 0 | Y)  model 1    model 2    model 3    model 4   model 5
-## Intercept     1.0000000  1.00000  1.0000000  1.0000000  1.0000000  1.000000
-## M             0.8497938  1.00000  1.0000000  1.0000000  1.0000000  1.000000
-## So            0.2703865  0.00000  0.0000000  0.0000000  0.0000000  0.000000
-## Ed            0.9734987  1.00000  1.0000000  1.0000000  1.0000000  1.000000
-## Po1           0.6642506  1.00000  1.0000000  0.0000000  1.0000000  1.000000
-## Po2           0.4477211  0.00000  0.0000000  1.0000000  0.0000000  0.000000
-## LF            0.1987747  0.00000  0.0000000  0.0000000  0.0000000  0.000000
-## M.F           0.2015977  0.00000  0.0000000  0.0000000  0.0000000  0.000000
-## Pop           0.3653004  0.00000  0.0000000  0.0000000  1.0000000  0.000000
-## NW            0.6881824  1.00000  1.0000000  1.0000000  1.0000000  0.000000
-## U1            0.2484557  0.00000  0.0000000  0.0000000  0.0000000  0.000000
-## U2            0.6088983  1.00000  1.0000000  1.0000000  1.0000000  1.000000
-## GDP           0.3545607  0.00000  0.0000000  0.0000000  0.0000000  0.000000
-## Ineq          0.9964071  1.00000  1.0000000  1.0000000  1.0000000  1.000000
-## Prob          0.8955326  1.00000  1.0000000  1.0000000  1.0000000  1.000000
-## Time          0.3657243  1.00000  0.0000000  0.0000000  0.0000000  0.000000
-## BF                   NA  1.00000  0.9643544  0.6523547  0.5944565  0.559408
-## PostProbs            NA  0.01820  0.0176000  0.0119000  0.0108000  0.010200
-## R2                   NA  0.84200  0.8265000  0.8229000  0.8375000  0.804600
-## dim                  NA  9.00000  8.0000000  8.0000000  9.0000000  7.000000
-## logmarg              NA 23.86818 23.8318875 23.4410171 23.3480762 23.287308
-

This lists the top 5 models (in terms of posterior probability) with the zero-one indicators for variable inclusion. The other columns in the summary are the Bayes factor of each model to the highest probability model (hence its Bayes factor is 1), the posterior probabilities of the models, the ordinary \(R^2\) of the models, the dimension of the models (number of coefficients including the intercept) and the log marginal likelihood under the selected prior distribution.

-
-
-

Visualization of the Model Space

-

To see beyond the first five models, we can represent the collection of the models via an image plot. By default this shows the top 20 models.

-
image(crime.ZS, rotate=F)
-

-

This image has rows that correspond to each of the variables and intercept, with labels for the variables on the y-axis. The x-axis corresponds to the possible models. These are sorted by their posterior probability from best at the left to worst at the right with the rank on the top x-axis.

-

Each column represents one of the 16 models. The variables that are excluded in a model are shown in black for each column, while the variables that are included are colored, with the color related to the log posterior probability. The color of each column is proportional to the log of the posterior probabilities (the lower x-axis) of that model. Models that are the same color have similar log posterior probabilities which allows us to view models that are clustered together that have marginal likelihoods where the differences are not “worth a bare mention”.

-

This plot indicates that the police expenditures in the two years do not enter the model together, which is an indication of the high correlation between the two variables.

-
-
-

Posterior Distributions of Coefficients

-

To examine the marginal distributions of the two coefficients for the police expenditures, we can extract the coefficients estimates and standard deviations under BMA.

-
coef.ZS = coef(crime.ZS)
-

An optional argument, n.models, to coef will use only the top n.models for BMA and may be more computationally efficient for large problems.

-

Plots the posterior distributions averaging over all of the models are obtained using the plot method.

-
plot(coef.ZS, subset=c(5:6),  ask=F)
-

-

The vertical bar represents the posterior probability that the coefficient is 0 while the bell shaped curve represents the density of plausible values from all the models where the coefficient is non-zero. This is scaled so that the height of the density for non-zero values is the probability that the coefficient is non-zero.

-

Omitting the subset argument provides all of the marginal distributions

-
plot(coef.ZS, ask=FALSE)
-

-

To obtain credible intervals for coefficients, BAS includes a confint method to create Highest Posterior Density intervals from the summaries from coef.

-
confint(coef.ZS)
-
##                    2.5%       97.5%        beta
-## Intercept  6.6651800672 6.780443181  6.72493620
-## M          0.0000000000 2.202669116  1.13858899
-## So        -0.0537147430 0.318492329  0.03520844
-## Ed         0.6088825181 3.172591130  1.85357935
-## Po1        0.0000000000 1.448189712  0.60192446
-## Po2       -0.1812437569 1.439477400  0.32019118
-## LF        -0.4332690346 1.039030829  0.05825902
-## M.F       -2.1007200294 2.055510539 -0.02158228
-## Pop       -0.1292704858 0.003550552 -0.02225960
-## NW        -0.0001244719 0.164949444  0.06607314
-## U1        -0.5304137200 0.341493016 -0.02368273
-## U2        -0.0004972749 0.676993108  0.20472599
-## GDP       -0.0529040553 1.189023806  0.20322019
-## Ineq       0.6722249399 2.131583144  1.39068095
-## Prob      -0.4123763948 0.000000000 -0.21400173
-## Time      -0.5328555936 0.043678384 -0.08268477
-## attr(,"Probability")
-## [1] 0.95
-## attr(,"class")
-## [1] "confint.bas"
-

where the third column is the posterior mean. This uses Monte Carlo sampling to draw from the mixture model over coefficients, where models are sampled based on their posterior probabilities.

-

We can also plot these via

-
plot(confint(coef.ZS, parm=2:16))
-

-
## NULL
-

using the parm argument to select which coefficients to plot (the intercept is parm=1).

-

For estimation under selection, BAS supports additional arguments via estimator. The default is estimator="BMA" which uses all models or n.models. Other options include estimation under the highest probability model

-
plot(confint(coef(crime.ZS, estimator="HPM")))
-

-
## NULL
-

or the median probability model

-
plot(confint(coef(crime.ZS, estimator="MPM")))
-

-
## NULL
-

where variables that are excluded have distributions that are point masses at zero under selection.

-
-
-

Prediction

-

BAS has methods defined to return fitted values, fitted, using the observed design matrix and predictions at either the observed data or potentially new values, predict, as with lm.

-
muhat.BMA = fitted(crime.ZS, estimator="BMA")
-BMA  = predict(crime.ZS, estimator="BMA")
-
-# predict has additional slots for fitted values under BMA, predictions under each model
-names(BMA)
-
##  [1] "fit"         "Ybma"        "Ypred"       "postprobs"   "se.fit"     
-##  [6] "se.pred"     "se.bma.fit"  "se.bma.pred" "df"          "best"       
-## [11] "bestmodel"   "prediction"  "estimator"
-

Plotting the two sets of fitted values,

-
par(mar=c(9, 9, 3, 3))
-plot(muhat.BMA, BMA$fit, 
-     pch=16, 
-     xlab=expression(hat(mu[i])), ylab=expression(hat(Y[i])))
-abline(0,1)
-

-

we see that they are in perfect agreement. That is always the case as the posterior mean for the regression mean function at a point \(x\) is the expected posterior predictive value for \(Y\) at \(x\). This is true not only for estimators such as BMA, but the expected values under model selection.

-
-

Inference with model selection

-

In addition to using BMA, we can use the posterior means under model selection. This corresponds to a decision rule that combines estimation and selection. BAS currently implements the following options

-

highest probability model:

-
HPM = predict(crime.ZS, estimator="HPM")
-
-# show the indices of variables in the best model where 0 is the intercept
-HPM$bestmodel
-
## [1]  0  1  3  4  9 11 13 14 15
-

A little more interpretable version with names:

-
(crime.ZS$namesx[HPM$bestmodel +1])[-1]
-
## [1] "M"    "Ed"   "Po1"  "NW"   "U2"   "Ineq" "Prob" "Time"
-

median probability model:

-
MPM = predict(crime.ZS, estimator="MPM")
-(crime.ZS$namesx[attr(MPM$fit, 'model') +1])[-1]
-
## [1] "M"    "Ed"   "Po1"  "NW"   "U2"   "Ineq" "Prob"
-

This is the model where all predictors have an inclusion probability greater than or equal to 0.5. This coincides with the HPM if the predictors are all mutually orthogonal, and in this case is the best predictive model under squared error loss.

-

Note that we can also extract the best model from the attribute in the fitted values as well.

-

best predictive model:

-

In general, the HPM or MPM are not the best predictive models, which from a Bayesian decision theory perspective would be the model that is closest to BMA predictions under squared error loss.

-
BPM = predict(crime.ZS, estimator="BPM")
-(crime.ZS$namesx[attr(BPM$fit, 'model') +1])[-1]
-
##  [1] "M"    "So"   "Ed"   "Po1"  "Po2"  "M.F"  "NW"   "U2"   "Ineq" "Prob"
-

Let’s see how they compare:

-
GGally::ggpairs(data.frame(HPM = as.vector(HPM$fit),  #this used predict so we need to extract fitted values
-                   MPM = as.vector(MPM$fit),  # this used fitted
-                   BPM = as.vector(BPM$fit),  # this used fitted
-                   BMA = as.vector(BMA$fit))) # this used predict
-

-

Using the se.fit = TRUE option with predict we can also calculate standard deviations for prediction or for the mean and use this as input for the confint function for the prediction object.

-
BPM = predict(crime.ZS, estimator="BPM", se.fit=TRUE)
-crime.conf.fit = confint(BPM, parm="mean")
-crime.conf.pred = confint(BPM, parm="pred")
-plot(crime.conf.fit)
-

-
## NULL
-
plot(crime.conf.pred)
-

-
## NULL
-

For prediction at new points, we can supply a new dataframe to the predict function as in lm.

-
new.pred = predict(crime.ZS, newdata=UScrime, estimator="MPM")
-
-
-
-

Alternative algorithms

-

BAS has several options for sampling from the model space with or without enumeration. The (current) default method="BAS" samples models without replacement using estimates of the marginal inclusion probabilities using the algorithm described in http://dx.doi.org/10.1198/jcgs.2010.09049. The initial sampling probabilities provided by initprobs are updated based on the sampled models, every update iterations. This can be more efficient in some cases if a large fraction of the model space has been sampled, however, in cases of high correlation and a large number of predictors, this can lead to biased estimates http://dx.doi.org/10.1093/biomet/ass040, in which case MCMC is preferred. The method="MCMC" is described below and is better for large \(p\).

-

A deterministic sampling scheme is also available for enumeration;

-
system.time(
-  for (i in 1:10) {
-    crime.ZS <- bas.lm(y ~ ., 
-                   data=UScrime,
-                   prior="ZS-null", method="BAS",
-                   modelprior=uniform(), initprobs="eplogp") 
-  }
-)
-
##    user  system elapsed 
-##  22.215   2.725  26.266
-
system.time(
-  for (i in 1:10)  {
-    crime.ZS <-  bas.lm(y ~ ., 
-                   data=UScrime,
-                   prior="ZS-null", method="deterministic",
-                   modelprior=uniform(), initprobs="eplogp") 
-  }
-)
-
##    user  system elapsed 
-##  19.577   2.433  23.182
-

which is faster for enumeration than the default method=“BAS”.

-
-
-

Beyond Enumeration

-

Many problems are too large to enumerate all possible models. In such cases we may use the method="BAS" to sample without replacement or the method="MCMC" option to sample models using Markov Chain Monte Carlo sampling to sample models based on their posterior probabilities. For spaces where the number of models greatly exceeds the number of models to sample, the MCMC option tends to provide estimates with low bias compared to the sampling without replacement.

-
crime.ZS =  bas.lm(y ~ ., 
-                   data=UScrime,
-                   prior="ZS-null",
-                   modelprior=uniform(),
-                   method = "MCMC") 
-

This will run the MCMC sampler until the number of unique sampled models exceeds n.models which is \(2^p\) (if \(p < 19\)) by default or until MCMC.iterations has been exceeded, where MCMC.iterations = n.models*2 by default.

-
-

Estimates of Marginal Posterior Inclusion Probabilities (pip)

-

With MCMC sampling there are two estimates of the marginal inclusion probabilities: object$probne0 which are obtained by using the re-normalized posterior odds from sampled models to estimate probabilities and the estimates based on Monte Carlo frequencies object$probs.MCMC. These should be in close agreement if the MCMC sampler has run for enough iterations.

-

BAS includes a diagnostic function to compare the two sets of estimates of posterior inclusion probabilities and posterior model probabilities

-
diagnostics(crime.ZS, type="pip",  pch=16)
-

-
diagnostics(crime.ZS, type="model",  pch=16)
-

-

In the left hand plot of pips, each point represents one posterior inclusion probability for the 15 variables estimated under the two methods. The two estimators are in pretty close agreement. The plot of the model probabilities suggests that we should use more MCMC.iterations if we want more accurate estimates of the posterior model probabilities.

-
crime.ZS =  bas.lm(y ~ ., 
-                   data=UScrime,
-                   prior="ZS-null",
-                   modelprior=uniform(),
-                   method = "MCMC", MCMC.iterations = 10^6)  
-
-# Don't run diagnostics(crime.ZS, type="model", pch=16)
-
-
-
-

Outliers

-

BAS can also be used for exploring mean shift or variance inflation outliers by adding indicator variables for each case being an outlier (the mean is not given by the regression) or not. This is similar to the MC3.REG function in BMA, although here we are using a g-prior or mixture of g-priors for the coefficients for the outlier means.

-

Using the Stackloss data, we can add an identity matrix to the original dataframe, where each column is an indicator that the ith case is an outlier.

-
data("stackloss")
-stackloss$out.ind = diag(nrow(stackloss))
-
-stack.bas = bas.lm(stack.loss ~ ., data=stackloss,
-                method="MCMC", initprobs="marg-eplogp",
-                prior="ZS-null", 
-                modelprior=tr.poisson(4,10),
-                MCMC.iterations=200000
-               )
-

The above call introduces using truncated prior distributions on the model space; in this case the distribution of the number of variables to be included has a Poisson distribution, with mean 4 (under no truncation), and the truncation point is at 10, so that all models with more than 10 (one half of cases rounded down) will have probability zero.

-

Looking at the summaries

-
knitr::kable(as.data.frame(summary(stack.bas)))
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
P(B != 0 | Y)model 1model 2model 3model 4model 5
Intercept1.0000001.00001.00000001.00000001.00000001.00000
Air.Flow0.9996401.00001.00000001.00000001.00000001.00000
Water.Temp0.4592851.00000.00000000.00000001.00000001.00000
Acid.Conc.0.0930300.00000.00000000.00000000.00000000.00000
out.ind10.5845651.00001.00000001.00000001.00000001.00000
out.ind20.1594300.00000.00000000.00000000.00000001.00000
out.ind30.6137351.00001.00000001.00000001.00000001.00000
out.ind40.9514251.00001.00000001.00000001.00000001.00000
out.ind50.0667150.00000.00000000.00000000.00000000.00000
out.ind60.0762150.00000.00000000.00000000.00000000.00000
out.ind70.0667550.00000.00000000.00000000.00000000.00000
out.ind80.0635700.00000.00000000.00000000.00000000.00000
out.ind90.0613550.00000.00000000.00000000.00000000.00000
out.ind100.0677150.00000.00000000.00000000.00000000.00000
out.ind110.0638800.00000.00000000.00000000.00000000.00000
out.ind120.0836750.00000.00000000.00000000.00000000.00000
out.ind130.3306800.00001.00000000.00000001.00000000.00000
out.ind140.1738550.00000.00000000.00000000.00000000.00000
out.ind150.0729100.00000.00000000.00000000.00000000.00000
out.ind160.0571950.00000.00000000.00000000.00000000.00000
out.ind170.0627100.00000.00000000.00000000.00000000.00000
out.ind180.0612550.00000.00000000.00000000.00000000.00000
out.ind190.0959450.00000.00000000.00000000.00000000.00000
out.ind200.1246050.00000.00000000.00000000.00000000.00000
out.ind210.9863001.00001.00000001.00000001.00000001.00000
BFNA1.00000.33975750.20931580.56171730.53992
PostProbsNA0.06860.02390000.02230000.02200000.02010
R2NA0.98920.98730000.98030000.99230000.99220
dimNA7.00007.00000006.00000008.00000008.00000
logmargNA24.099823.020274022.535886023.523040623.48346
-
-
-

Summary

-

BAS includes other prior distributions on coefficients and models, as well as bas.glm for fitting Generalized Linear Models. The syntax for bas.glm and bas.lm are not yet the same, particularly for how some of the priors on coefficients are represented, so please see the documentation for more features and details until this is updated!

-

For issues or feature requests please submit via the package’s github page merliseclyde/BAS

-
- - - - - - - - diff --git a/src/bayesreg.c b/src/bayesreg.c index 0da1c7cd..03810d94 100644 --- a/src/bayesreg.c +++ b/src/bayesreg.c @@ -4,7 +4,7 @@ void PrecomputeData(double *Xwork, double *Ywork, double *wts, double **pXtXwork, double **pXtYwork, double **pXtX, double **pXtY, double *yty, double *SSY, int p, int nobs) { char uplo[] = "U", trans[]="T"; - double one=1.0, zero=0.0; + double one=1.0, zero=0.0, nwts=0.0; int inc=1, i, j,l; int p2 = p * p; @@ -20,8 +20,10 @@ void PrecomputeData(double *Xwork, double *Ywork, double *wts, double **pXtXwork } } +// wts are actully sqrt of weights for (i = 0; i< nobs; i++) { Ywork[i] = Ywork[i]*wts[i]; + *yty += Ywork[i]*Ywork[i]; } //precompute XtX @@ -29,14 +31,14 @@ void PrecomputeData(double *Xwork, double *Ywork, double *wts, double **pXtXwork F77_NAME(dsyrk)(uplo, trans, &p, &nobs, &one, &Xwork[0], &nobs, &zero, *pXtX, &p); double ybar = 0.0; - for (int i = 0; i< nobs; i++) { - ybar += Ywork[i]; - } - ybar = F77_NAME(ddot) (&nobs, &Ywork[0], &inc, &wts[0], &inc)/ - F77_NAME(ddot) (&nobs, &wts[0], &inc, &wts[0], &inc); - *yty = F77_NAME(ddot)(&nobs, &Ywork[0], &inc, &Ywork[0], &inc); // ybar = ybar/ (double) nobs; - *SSY = *yty - (double) nobs* ybar *ybar; + ybar = F77_NAME(ddot)(&nobs, &Ywork[0], &inc, &wts[0], &inc); // sum y*wts^2 + nwts = F77_NAME(ddot)(&nobs, &wts[0], &inc, &wts[0], &inc); // sum wts^2 + ybar = ybar/nwts; + + *yty = F77_NAME(ddot)(&nobs, &Ywork[0], &inc, &Ywork[0], &inc); + *SSY = *yty - (double) nwts* ybar *ybar; + F77_NAME(dgemv)(trans, &nobs, &p, &one, &Xwork[0], &nobs, &Ywork[0], &inc, &zero, *pXtY,&inc); } @@ -207,7 +209,8 @@ void cholreg(double *XtY, double *XtX, double *coefficients, double *se, double /* On entry *coefficients equals X'Y, which is over written with the OLS estimates */ /* On entry MSE = Y'Y */ - double ete, one, zero, tol=100*DBL_EPSILON; + double ete, one, zero; +// double tol=100*DBL_EPSILON; int job, l, i, j, info, inc; zero = 0.0; one = 1.0; diff --git 
a/vignettes/BAS-vignette.R b/vignettes/BAS-vignette.R index 4a5ca23f..4cb53b7d 100644 --- a/vignettes/BAS-vignette.R +++ b/vignettes/BAS-vignette.R @@ -11,10 +11,10 @@ UScrime[,-2] = log(UScrime[,-2]) ## ----bas----------------------------------------------------------------- library(BAS) -crime.ZS = bas.lm(y ~ ., +crime.ZS = bas.lm(y ~ ., data=UScrime, prior="ZS-null", - modelprior=uniform(), initprobs="eplogp") + modelprior=uniform(), initprobs="eplogp") ## ---- fig.show='hold'---------------------------------------------------- plot(crime.ZS, ask=F) @@ -66,8 +66,8 @@ names(BMA) ## ---- fig.width=5, fig.height=5----------------------------------------------- par(mar=c(9, 9, 3, 3)) -plot(muhat.BMA, BMA$fit, - pch=16, +plot(muhat.BMA, BMA$fit, + pch=16, xlab=expression(hat(mu[i])), ylab=expression(hat(Y[i]))) abline(0,1) @@ -108,29 +108,29 @@ new.pred = predict(crime.ZS, newdata=UScrime, estimator="MPM") ## ----------------------------------------------------------------------------- system.time( for (i in 1:10) { - crime.ZS <- bas.lm(y ~ ., + crime.ZS <- bas.lm(y ~ ., data=UScrime, prior="ZS-null", method="BAS", - modelprior=uniform(), initprobs="eplogp") + modelprior=uniform(), initprobs="eplogp") } ) system.time( for (i in 1:10) { - crime.ZS <- bas.lm(y ~ ., + crime.ZS <- bas.lm(y ~ ., data=UScrime, prior="ZS-null", method="deterministic", - modelprior=uniform(), initprobs="eplogp") + modelprior=uniform(), initprobs="eplogp") } ) ## ----MCMC--------------------------------------------------------------------- -crime.ZS = bas.lm(y ~ ., +crime.ZS = bas.lm(y ~ ., data=UScrime, prior="ZS-null", modelprior=uniform(), - method = "MCMC") + method = "MCMC") ## ----diagnostics-------------------------------------------------------------- diagnostics(crime.ZS, type="pip", pch=16) @@ -142,17 +142,16 @@ diagnostics(crime.ZS, type="model", pch=16) # prior="ZS-null", # modelprior=uniform(), # method = "MCMC", MCMC.iterations = 10^6) -# +# # # Don't run diagnostics(crime.ZS, 
type="model", pch=16) ## ----add-out------------------------------------------------------------------ data("stackloss") - stackloss$out.ind = diag(nrow(stackloss)) stack.bas = bas.lm(stack.loss ~ ., data=stackloss, method="MCMC", initprobs="marg-eplogp", - prior="ZS-null", + prior="ZS-null", modelprior=tr.poisson(4,10), MCMC.iterations=200000 ) diff --git a/vignettes/BAS-vignette.Rmd b/vignettes/BAS-vignette.Rmd index 167fcf59..538a6788 100644 --- a/vignettes/BAS-vignette.Rmd +++ b/vignettes/BAS-vignette.Rmd @@ -380,6 +380,7 @@ Looking at the summaries knitr::kable(as.data.frame(summary(stack.bas))) ``` +## Weighted Regression ## Summary diff --git a/vignettes/BAS-vignette.html b/vignettes/BAS-vignette.html index 70610a37..72c10f1b 100644 --- a/vignettes/BAS-vignette.html +++ b/vignettes/BAS-vignette.html @@ -12,7 +12,7 @@ - + Using the Bayesian Adaptive Sampling (BAS) Package for Bayesian Model Averaging and Variable Selection @@ -20,46 +20,256 @@ - + @@ -70,7 +280,7 @@

Using the Bayesian Adaptive Sampling (BAS) Package for Bayesian Model Averaging and Variable Selection

Merlise A Clyde

-

2017-06-30

+

2018-05-01

@@ -79,24 +289,24 @@

2017-06-30

Installing BAS

The stable version can be installed easily in the R console like any other package:

-
install.packages('BAS')
+

On the other hand, I welcome everyone to use the most recent version of the package with quick-fixes, new features and probably new bugs. To get the latest development version from GitHub, use the devtools package from CRAN and enter in R:

-
# devtools::install_github('merliseclyde/BAS')
+

As the package does depend on BLAS and LAPACK, installing from GitHub will require that you have FORTRAN and C compilers on your system.

Demo

We will use the UScrime data to illustrate some of the commands and functionality.

-
library(MASS)
-data(UScrime)
+

Following other analyses, we will go ahead and log transform all of the variables except column 2, which is the indicator variable of the state being a southern state.

-
UScrime[,-2] = log(UScrime[,-2])
+

To get started, we will use BAS with the Zellner-Siow Cauchy prior on the coefficients.

-
library(BAS)
-crime.ZS =  bas.lm(y ~ ., 
-                   data=UScrime,
-                   prior="ZS-null",
-                   modelprior=uniform(), initprobs="eplogp") 
+

BAS uses a model formula similar to lm to specify the full model with all of the potential predictors. Here we are using the shorthand . to indicate that all remaining variables in the data frame will be included. BAS require a data frame as the input for the data argument. Different prior distributions on the regression coefficients may be specified using the prior argument, and include