Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perhaps I should turn off "(add-hook 'before-save-hook 'delete-traili… #17

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 53 additions & 54 deletions R/ComBat.R
Original file line number Diff line number Diff line change
@@ -1,77 +1,76 @@
#' Adjust for batch effects using an empirical Bayes framework
#'
#' ComBat allows users to adjust for batch effects in datasets where the batch covariate is known, using methodology
#' described in Johnson et al. 2007. It uses either parametric or non-parametric empirical Bayes frameworks for adjusting data for
#'
#' ComBat allows users to adjust for batch effects in datasets where the batch covariate is known, using methodology
#' described in Johnson et al. 2007. It uses either parametric or non-parametric empirical Bayes frameworks for adjusting data for
#' batch effects. Users are returned an expression matrix that has been corrected for batch effects. The input
#' data are assumed to be cleaned and normalized before batch effect removal.
#'
#' data are assumed to be cleaned and normalized before batch effect removal.
#'
#' @param dat Genomic measure matrix (dimensions probe x sample) - for example, expression matrix
#' @param batch {Batch covariate (only one batch allowed)}
#' @param mod Model matrix for outcome of interest and other covariates besides batch
#' @param par.prior (Optional) TRUE indicates parametric adjustments will be used, FALSE indicates non-parametric adjustments will be used
#' @param prior.plots (Optional) TRUE give prior plots with black as a kernel estimate of the empirical batch effect density and red as the parametric
#' @param mean.only (Optional) FALSE If TRUE ComBat only corrects the mean of the batch effect (no scale adjustment)
#' @param ref.batch (Optional) NULL If given, will use the selected batch as a reference for batch adjustment.
#' @param ref.batch (Optional) NULL If given, will use the selected batch as a reference for batch adjustment.
#' @param BPPARAM (Optional) BiocParallelParam for parallel operation
#'
#' @return data A probe x sample genomic measure matrix, adjusted for batch effects.
#'
#' @examples
#'
#' @examples
#' library(bladderbatch)
#' data(bladderdata)
#' dat <- bladderEset[1:50,]
#'
#'
#' pheno = pData(dat)
#' edata = exprs(dat)
#' batch = pheno$batch
#' mod = model.matrix(~as.factor(cancer), data=pheno)
#'
#'
#' # parametric adjustment
#' combat_edata1 = ComBat(dat=edata, batch=batch, mod=NULL, par.prior=TRUE, prior.plots=FALSE)
#'
#'
#' # non-parametric adjustment, mean-only version
#' combat_edata2 = ComBat(dat=edata, batch=batch, mod=NULL, par.prior=FALSE, mean.only=TRUE)
#'
#'
#' # reference-batch version, with covariates
#' combat_edata3 = ComBat(dat=edata, batch=batch, mod=mod, par.prior=TRUE, ref.batch=3)
#'
#'
#' @export
#'

#'
ComBat <- function (dat, batch, mod = NULL, par.prior = TRUE, prior.plots = FALSE,
mean.only = FALSE, ref.batch = NULL, BPPARAM = bpparam()) {
# make batch a factor and make a set of indicators for batch
if(mean.only==TRUE){cat("Using the 'mean only' version of ComBat\n")}
if(mean.only==TRUE){message("Using the 'mean only' version of ComBat.")}
if(length(dim(batch))>1){stop("This version of ComBat only allows one batch variable")} ## to be updated soon!
batch <- as.factor(batch)
batchmod <- model.matrix(~-1+batch)
batchmod <- model.matrix(~-1+batch)
if (!is.null(ref.batch)){ # check for reference batch, check value, and make appropriate changes
if (!(ref.batch%in%levels(batch))){stop("reference level ref.batch is not one of the levels of the batch variable")}
cat("Using batch =",ref.batch, "as a reference batch (this batch won't change)\n")
message(paste0("Using batch =", ref.batch, "as a reference batch (this batch won't change)."))
ref = which(levels(as.factor(batch))==ref.batch) # find the reference
batchmod[,ref]=1
}else{ref=NULL}
cat("Found",nlevels(batch),'batches\n')
message(paste0("Found", nlevels(batch), "batches."))

# A few other characteristics on the batches
n.batch <- nlevels(batch)
batches <- list()
for (i in 1:n.batch){batches[[i]] <- which(batch == levels(batch)[i])} # list of samples in each batch
for (i in 1:n.batch){batches[[i]] <- which(batch == levels(batch)[i])} # list of samples in each batch
n.batches <- sapply(batches, length)
if(any(n.batches==1)){mean.only=TRUE; cat("Note: one batch has only one sample, setting mean.only=TRUE\n")}
if(any(n.batches==1)){mean.only=TRUE; message("Note: one batch has only one sample, setting mean.only=TRUE.")}
n.array <- sum(n.batches)

#combine batch variable and covariates
design <- cbind(batchmod,mod)

# check for intercept in covariates, and drop if present
check <- apply(design, 2, function(x) all(x == 1))
if(!is.null(ref)){check[ref]=FALSE} ## except don't throw away the reference batch indicator
design <- as.matrix(design[,!check])

# Number of covariates or covariate levels
cat("Adjusting for",ncol(design)-ncol(batchmod),'covariate(s) or covariate level(s)\n')
message(paste0("Adjusting for", ncol(design)-ncol(batchmod), "covariate(s) or covariate level(s).")

# Check if the design is confounded
if(qr(design)$rank<ncol(design)){
#if(ncol(design)<=(n.batch)){stop("Batch variables are redundant! Remove one or more of the batch variables so they are no longer confounded")}
Expand All @@ -80,27 +79,27 @@ ComBat <- function (dat, batch, mod = NULL, par.prior = TRUE, prior.plots = FALS
if((qr(design[,-c(1:n.batch)])$rank<ncol(design[,-c(1:n.batch)]))){stop('The covariates are confounded! Please remove one or more of the covariates so the design is not confounded')
}else{stop("At least one covariate is confounded with batch! Please remove confounded covariates and rerun ComBat")}}
}

## Check for missing values
NAs = any(is.na(dat))
if(NAs){cat(c('Found',sum(is.na(dat)),'Missing Data Values\n'),sep=' ')}
if(NAs){message(paste0("Found ", sum(is.na(dat)), " Missing Data Values."))}
#print(dat[1:2,])

##Standardize Data across genes
cat('Standardizing Data across genes\n')
message("Standardizing Data across genes.")
if (!NAs){
B.hat <- solve(t(design)%*%design)%*%t(design)%*%t(as.matrix(dat))
}else{
B.hat=apply(dat,1,Beta.NA,design)
}

######## change grand.mean for ref batch
if(!is.null(ref.batch)){
grand.mean <- t(B.hat[ref, ])
}else{
grand.mean <- t(n.batches/n.array)%*%B.hat[1:n.batch,]
}

######## change var.pooled for ref batch
if (!NAs){
if(!is.null(ref.batch)){
Expand All @@ -117,36 +116,36 @@ ComBat <- function (dat, batch, mod = NULL, par.prior = TRUE, prior.plots = FALS
var.pooled <- apply(dat-t(design%*%B.hat),1,var,na.rm=TRUE)
}
}

stand.mean <- t(grand.mean)%*%t(rep(1,n.array))
if(!is.null(design)){tmp <- design;tmp[,c(1:n.batch)] <- 0;stand.mean <- stand.mean+t(tmp%*%B.hat)}
if(!is.null(design)){tmp <- design;tmp[,c(1:n.batch)] <- 0;stand.mean <- stand.mean+t(tmp%*%B.hat)}
s.data <- (dat-stand.mean)/(sqrt(var.pooled)%*%t(rep(1,n.array)))

##Get regression batch effect parameters
cat("Fitting L/S model and finding priors\n")
message("Fitting L/S model and finding priors.")
batch.design <- design[,1:n.batch]
if (!NAs){
gamma.hat <- solve(t(batch.design)%*%batch.design)%*%t(batch.design)%*%t(as.matrix(s.data))
} else{
gamma.hat=apply(s.data,1,Beta.NA,batch.design)

}
delta.hat <- NULL
for (i in batches){
if(mean.only==TRUE){delta.hat <- rbind(delta.hat,rep(1,nrow(s.data)))}else{
delta.hat <- rbind(delta.hat,apply(s.data[,i], 1, var,na.rm=TRUE))
}
}

##Find Priors
gamma.bar <- apply(gamma.hat, 1, mean)
t2 <- apply(gamma.hat, 1, var)
a.prior <- apply(delta.hat, 1, aprior)
b.prior <- apply(delta.hat, 1, bprior)


##Plot empirical and parametric priors

if (prior.plots & par.prior){
par(mfrow=c(2,2))
tmp <- density(gamma.hat[1,])
Expand All @@ -166,12 +165,12 @@ ComBat <- function (dat, batch, mod = NULL, par.prior = TRUE, prior.plots = FALS
lines(c(0, max(invgam)), c(0, max(invgam)), col=2)
title("Q-Q Plot")
}

##Find EB batch adjustments

gamma.star <- delta.star <- matrix(NA, nrow=n.batch, ncol=nrow(s.data))
if (par.prior) {
cat("Finding parametric adjustments\n")
message("Finding parametric adjustments.")
results <- bplapply(1:n.batch, function(i) {
if (mean.only) {
gamma.star <- postmean(gamma.hat[i,], gamma.bar[i], 1, 1, t2[i])
Expand All @@ -192,7 +191,7 @@ ComBat <- function (dat, batch, mod = NULL, par.prior = TRUE, prior.plots = FALS
}
}
else {
cat("Finding nonparametric adjustments\n")
message("Finding nonparametric adjustments.")
results <- bplapply(1:n.batch, function(i) {
if (mean.only) {
delta.hat[i, ] = 1
Expand All @@ -206,32 +205,32 @@ ComBat <- function (dat, batch, mod = NULL, par.prior = TRUE, prior.plots = FALS
delta.star[i,] <- results[[i]]$delta.star
}
}

if(!is.null(ref.batch)){
gamma.star[ref,]=0 ## set reference batch mean equal to 0
delta.star[ref,]=1 ## set reference batch variance equal to 1
}

### Normalize the Data ###
cat("Adjusting the Data\n")
message("Adjusting the Data.")

bayesdata <- s.data
j <- 1
for (i in batches){
bayesdata[,i] <- (bayesdata[,i]-t(batch.design[i,]%*%gamma.star))/(sqrt(delta.star[j,])%*%t(rep(1,n.batches[j])))
j <- j+1
}

bayesdata <- (bayesdata*(sqrt(var.pooled)%*%t(rep(1,n.array))))+stand.mean

##### tiny change still exist when tested on bladder data
#### total sum of change within each batch around 1e-15
##### (could be computational system error).
#### total sum of change within each batch around 1e-15
##### (could be computational system error).
##### Do not change ref batch at all in reference version
if(!is.null(ref.batch)){
bayesdata[, batches[[ref]]] <- dat[, batches[[ref]]]
}

return(bayesdata)

}
29 changes: 14 additions & 15 deletions R/empirical.controls.R
Original file line number Diff line number Diff line change
@@ -1,38 +1,37 @@
#' A function for estimating the probability that each gene is an empirical control
#'
#' A function for estimating the probability that each gene is an empirical control
#'
#' This function uses the iteratively reweighted surrogate variable analysis approach
#' to estimate the probability that each gene is an empirical control.
#'
#' to estimate the probability that each gene is an empirical control.
#'
#' @param dat The transformed data matrix with the variables in rows and samples in columns
#' @param mod The model matrix being used to fit the data
#' @param mod0 The null model being compared when fitting the data
#' @param n.sv The number of surrogate variables to estimate
#' @param B The number of iterations of the irwsva algorithm to perform
#' @param type If type is norm then standard irwsva is applied, if type is counts, then the moderated log transform is applied first
#'
#' @return pcontrol A vector of probabilities that each gene is a control.
#'
#' @examples
#'
#' @return pcontrol A vector of probabilities that each gene is a control.
#'
#' @examples
#' library(bladderbatch)
#' data(bladderdata)
#' dat <- bladderEset[1:5000,]
#'
#'
#' pheno = pData(dat)
#' edata = exprs(dat)
#' mod = model.matrix(~as.factor(cancer), data=pheno)
#'
#'
#' n.sv = num.sv(edata,mod,method="leek")
#' pcontrol <- empirical.controls(edata,mod,mod0=NULL,n.sv=n.sv,type="norm")
#'
#'
#' @export
#'

#'
empirical.controls <- function(dat, mod, mod0 = NULL, n.sv, B = 5,
                               type = c("norm", "counts")) {
  # Estimate the probability that each gene is an empirical control using
  # the iteratively reweighted SVA (irwsva) approach.
  #
  # Args:
  #   dat:  data matrix, variables in rows and samples in columns
  #   mod:  model matrix used to fit the data
  #   mod0: null model matrix compared against mod (default NULL)
  #   n.sv: number of surrogate variables to estimate
  #   B:    number of irwsva iterations
  #   type: "norm" for standard irwsva; "counts" applies the moderated
  #         log transform log(dat + 1) first
  # Returns: pcontrol, a vector of probabilities (pprob.gam from irwsva.build)
  #          that each gene is an empirical control.
  type <- match.arg(type)
  if (type == "counts" && any(dat < 0)) {
    stop("empirical controls error: counts must be zero or greater")
  }
  if (type == "counts") {
    # Moderated log transform stabilizes count data before SVA.
    dat <- log(dat + 1)
  }
  # Bug fix: forward the user-supplied mod0 and B. The original hard-coded
  # mod0 = NULL and B = 5, silently ignoring the function's own arguments;
  # defaults are unchanged, so existing default-argument callers are unaffected.
  svobj <- irwsva.build(dat, mod, mod0 = mod0, n.sv, B = B)
  pcontrol <- svobj$pprob.gam
  return(pcontrol)
}
31 changes: 15 additions & 16 deletions R/f.pvalue.R
Original file line number Diff line number Diff line change
@@ -1,49 +1,48 @@
#' A function for quickly calculating f statistic p-values for use in sva
#'
#'
#' This function does simple linear algebra to calculate f-statistics
#' for each row of a data matrix comparing the nested models
#' defined by the design matrices for the alternative (mod) and null (mod0) cases.
#' The columns of mod0 must be a subset of the columns of mod.
#'
#' The columns of mod0 must be a subset of the columns of mod.
#'
#' @param dat The transformed data matrix with the variables in rows and samples in columns
#' @param mod The model matrix being used to fit the data
#' @param mod0 The null model being compared when fitting the data
#'
#' @return p A vector of F-statistic p-values one for each row of dat.
#'
#' @examples
#'
#' @return p A vector of F-statistic p-values one for each row of dat.
#'
#' @examples
#' library(bladderbatch)
#' data(bladderdata)
#' dat <- bladderEset[1:50,]
#'
#'
#' pheno = pData(dat)
#' edata = exprs(dat)
#' mod = model.matrix(~as.factor(cancer), data=pheno)
#' mod0 = model.matrix(~1,data=pheno)
#'
#'
#' pValues = f.pvalue(edata,mod,mod0)
#' qValues = p.adjust(pValues,method="BH")
#'
#'
#' @export
#'

#'
f.pvalue <- function(dat, mod, mod0) {
  # Quickly compute F-statistic p-values for each row of a data matrix,
  # comparing the nested models defined by the design matrices mod
  # (alternative) and mod0 (null). The columns of mod0 must be a subset
  # of the columns of mod.
  #
  # Args:
  #   dat:  numeric matrix, variables in rows and samples in columns
  #   mod:  model matrix (n samples x df1 columns), alternative model
  #   mod0: model matrix (n samples x df0 columns), null model
  # Returns: a vector of F-test p-values, one per row of dat.
  n <- dim(dat)[2]
  df1 <- dim(mod)[2]
  df0 <- dim(mod0)[2]
  Id <- diag(n)

  # Residuals under the alternative model: the projector
  # (I - X (X'X)^{-1} X') maps each sample vector onto the residual space.
  resid <- dat %*% (Id - mod %*% solve(t(mod) %*% mod) %*% t(mod))
  rss1 <- rowSums(resid * resid)
  rm(resid)  # free the large intermediate before allocating the next one

  # Residuals under the null model.
  resid0 <- dat %*% (Id - mod0 %*% solve(t(mod0) %*% mod0) %*% t(mod0))
  rss0 <- rowSums(resid0 * resid0)
  rm(resid0)

  # Standard nested-model F statistic and its upper-tail p-value.
  # (Removed the original's dead preallocation `p <- rep(0, m)`, which was
  # immediately overwritten, and the then-unused `m`.)
  fstats <- ((rss0 - rss1) / (df1 - df0)) / (rss1 / (n - df1))
  1 - pf(fstats, df1 = (df1 - df0), df2 = (n - df1))
}
Loading