-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from Jmendo12/dev
Finished a working diversity divergence test.
- Loading branch information
Showing
10 changed files
with
340 additions
and
558 deletions.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# This is the main file in which to conduct tests, and calculate the resulting likelihood ratios and chi sqaured values. | ||
# Author: Rori Rohlfs, Lars Gronvold, John Mendoza | ||
# Date: 3/2/2019 | ||
|
||
source('./scripts/eve-io.R') | ||
source('./scripts/dvdt.R') | ||
source('./scripts/dvdtsb.R') | ||
|
||
divergenceDiversityTest <- function() | ||
{ | ||
# Initialize the tree | ||
tree <- initializePhylogeny() | ||
|
||
# Initialize the number of samples per species | ||
num.indivs <- getIndividuals() | ||
|
||
# Initialize the gene data | ||
gene.data <- getExprData(num.indivs) | ||
|
||
# Calculate the total likelihoods for the null and alternative hypothesis | ||
ll.IndivBeta <- calculateLLIndivBeta(tree, num.indivs, gene.data) | ||
ll.SharedBeta <- calculateLLsharedBeta(tree, num.indivs, gene.data) | ||
|
||
# Calculate the likelihood ratios - the if statements are to prevent NaN values from being returned from the natural | ||
# log calculations; these NaNs are returned if the argument passed in to log is negative | ||
if( ll.IndivBeta < 0 ) | ||
{ | ||
lr <- -2 * log(-ll.IndivBeta) - (-2 * log(ll.SharedBeta)) | ||
} | ||
else if(ll.SharedBeta < 0) | ||
{ | ||
lr < - 2 * log(ll.IndivBeta) - (-2 * log(-ll.SharedBeta)) | ||
} | ||
else | ||
{ | ||
lr <- -2 * log(ll.IndivBeta) - (-2 * log(ll.SharedBeta)) | ||
} | ||
|
||
# If the likelihood ratio is negative, make it positive so that caculating log(lr) does not return NaN | ||
if(lr < 0) | ||
{ | ||
lr <- -lr | ||
} | ||
|
||
# Calculate chi sqaured with one degree of freedom | ||
chi.squared <- 2 * log(lr) | ||
|
||
# Since R does not support returning multiple values from a function, save both the likelihood ration value and chi sqaured | ||
# value to a results file. To view these results in your enviromnent call load("./results/dvdtresults.RData") | ||
save(lr , chi.squared, file = "./results/dvdtresults.RData") | ||
|
||
# Return the chi squared value | ||
return(chi.squared) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
# This file contains functions to calculate gene likelihoods assuming a shared beta value between genes. | ||
# To run the calculation as a whole execute the function calculateLLsharedBeta. The total likelihood result | ||
# will be returned from the function. The full results will be stored to an .RData file; to view these results | ||
# call load("./results/llsharedbetaresults.RData"). | ||
# Author: Rori Rohlfs, Lars Gronvold, John Mendoza | ||
# Date 2/25/19 | ||
|
||
#Include all our necessary libraries | ||
library(mvtnorm) | ||
source('./scripts/eve-io.R') | ||
source('./scripts/dvdt.R') | ||
|
||
# This function calculates the initial parameters for the test where the beta paramater is assumed to be shared between genes | ||
calculateParamsSharedBeta <- function(gene.data, num.indivs) | ||
{ | ||
# Create the paramater matrix with the same number of rows as genes and one column for each parameter, and then name cols | ||
param.matrix <- matrix(nrow = nrow(gene.data), ncol = 4) | ||
colnames(param.matrix) <- c("theta", "sigma2", "alpha", "betaShared") | ||
|
||
# Create a vector in which to store the mean value of data per species per row | ||
species.mean <- vector(mode = "numeric", length = length(num.indivs)) | ||
# Create a matrix to store the variance within species for each species and each gene | ||
species.var <- matrix(nrow = nrow(gene.data), ncol = length(num.indivs)) | ||
# Create a vector to store the mean of the variance within species for each gene | ||
mean.species.var <- vector(mode = "numeric", length = nrow(species.var)) | ||
|
||
alpha <- .5 | ||
|
||
# For each row within the paramater matrix, fill each column with parameter values | ||
for(row in 1:nrow(param.matrix)) | ||
{ | ||
# These start and end indices define where one species begins and ends for a gene | ||
start <- 1 | ||
end <- num.indivs[1] | ||
|
||
for(i in 1:length(num.indivs)) | ||
{ | ||
# This increments the end point for a species for a gene | ||
if(i != 1) | ||
{ | ||
end <- end + num.indivs[i] | ||
} | ||
# Calculate the mean of expression for each species and the variance of expression for each species | ||
species.mean[i] <- mean(gene.data[row, start:end]) | ||
species.var[row, i] <- var(gene.data[row, start:end]) | ||
# This increments the beginning point of a species for a gene | ||
start <- start + num.indivs[i] | ||
} | ||
# Fill the paramter matrix with theta, sigma sqaured, and alpha values. Compute the mean variance for each gene and store it | ||
param.matrix[row, 1] <- mean(species.mean) | ||
param.matrix[row, 2] <- var(species.mean) | ||
param.matrix[row, 3] <- alpha | ||
mean.species.var[row] <- mean(species.var[row,]) | ||
|
||
} | ||
|
||
# Compute and store the shared beta value as the mean of mean variance for each species for each gene divided by the mean of | ||
# sigma values | ||
param.matrix[,4] <- mean(mean.species.var) / mean(param.matrix[,2]) | ||
return(param.matrix) | ||
} | ||
|
||
make.LogLikOU <- function(tree, gene.data.row, num.indivs, fixed = c(FALSE, FALSE, FALSE, FALSE)) | ||
{ | ||
params <- fixed | ||
|
||
function(p) | ||
{ | ||
# Use a boolean vector to determine which paramter to fix for the optimization | ||
params[!fixed] <- p[!fixed] | ||
|
||
# Create variables with paramters to pass into the function for calculating expression variance | ||
theta <- params[1] | ||
sigma.squared <- params[2] | ||
alpha <- params[3] | ||
beta <- params[4] | ||
|
||
# Define the expected species mean and evolutionary variance for the root | ||
expected.species.mean.root <- theta | ||
evol.var.root <- sigma.squared / (2 * alpha) | ||
|
||
expression.var <- calcExpVarOU(tree, theta, alpha, sigma.squared, evol.var.root, expected.species.mean.root) | ||
covar.matrix <- calcCovMatOU(tree, alpha, expression.var$evol.variance) | ||
expanded.matrix <- expandECovMatrix(expression.var$expected.mean, covar.matrix, sigma.squared, alpha, beta, num.indivs) | ||
|
||
# Get log likelihood from the multivariate normal distribution density | ||
ll <- dmvnorm(x = gene.data.row, mean = expanded.matrix$expected.mean, sigma = expanded.matrix$cov.matr, log = TRUE) | ||
return(-ll) | ||
} | ||
} | ||
|
||
# Test for divergence and diversity between genes within species assuming a shared beta value between genes | ||
calculateLLsharedBeta <- function(tree, num.indivs, gene.data) | ||
{ | ||
# Calculate the inital per gene parameter matrix based on the gene data; note that in this version of the function, beta is not | ||
# calculated as we are under the assumption that beta is shared between genes | ||
param.matrix <- calculateParamsSharedBeta(gene.data, num.indivs) | ||
|
||
# Store the beta value in an independent variable and remove the beta column from the paramter matrix | ||
#beta <- param.matrix[1,4] | ||
#param.matrix <- param.matrix[, -4] | ||
|
||
# Create a vector to hold the values of the likelihood per gene and a list to hold the return values from the call to optim | ||
ll.pergene.sharedBeta <- vector(mode = "numeric", length = nrow(param.matrix)) | ||
max.params <- list() | ||
|
||
# First, create an environment which contains the function to be optimized, and can hold all values other than beta fixed | ||
# during the optimization | ||
llOU <- make.LogLikOU(tree, gene.data[1, ], num.indivs, | ||
c(param.matrix[1, 1], param.matrix[1, 2], param.matrix[1, 3], FALSE)) | ||
# Next, optimize the beta value for the first row of the initial paramters | ||
max.params <- optim(param.matrix[1, ], fn = llOU, gr = NULL, method = "L-BFGS-B", lower = c(.001, .001)) | ||
# Store the result in a numerical vector, and then set the beta value for all genes as this result | ||
vec <- as.numeric(max.params[[1]]) | ||
param.matrix[, 4] <- vec[4] | ||
|
||
# For each gene, optimize the parameters keeping the shared beta fixed, and store the resulting likelihood in the likelihood vector | ||
for(row in 1:length(ll.pergene.sharedBeta)) | ||
{ | ||
# Now, create an enviroment with the function to be optimized, and hold the shared beta value fixed | ||
llOU <- make.LogLikOU(tree, gene.data[row, ], num.indivs, | ||
c(FALSE, FALSE, FALSE, param.matrix[row, 4])) | ||
# Store the results of the optimization in a list | ||
max.params <- optim(param.matrix[row, ], fn = llOU, gr = NULL, method = "L-BFGS-B", lower = c(.001, .001)) | ||
# Store the likelihood calculation based on the optimized parameter in a vector | ||
ll.pergene.sharedBeta[row] <- as.numeric(max.params[2]) | ||
} | ||
|
||
# Calculate the total likelihood as the product of all values within the likelihood vector | ||
ll.total.sharedBeta <- calculateTotalLL(ll.pergene.sharedBeta) | ||
|
||
# Save the results to a file that can be loaded into any future R environment for future use | ||
# To load this data simply call load("./results/llsharedbetaresults.RData") | ||
save(ll.pergene.sharedBeta, ll.total.sharedBeta, file = "./results/llsharedbetaresults.RData") | ||
|
||
return(ll.total.sharedBeta) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# This file contains functions to read input from data files for the EVE-model tests. To include these functions in other files | ||
# code source('eve-io.R') into the file you'd like to use these methods in. | ||
# Author: Rori Rohlfs, Lars Gronvold, John Mendoza | ||
# Data 2/25/19 | ||
|
||
#Include all our necessary libraries | ||
library(ape) | ||
|
||
## Phylogengy Implementation | ||
# Initiliaze the phylogeny data and plot the phylogeny | ||
initializePhylogeny <- function() | ||
{ | ||
# Read in the filename the phylogeny is stored in | ||
fileName <- paste("./data/", sep = "", readline(prompt = "Enter the file in which the phylogeny data is stored, and include the file extension: ")) | ||
# Create a tree from the file given by the user | ||
tree <- read.tree(fileName) | ||
# Reset the margins | ||
par(mar = c(5, 4, 4, 2) + 0.1) | ||
# Plot the tree so the edge, node numbers, and tip labels are shown | ||
plot.phylo(tree, type = "p", show.tip.label = T, label.offset = .05, y.lim = c(.5, 5.5), no.margin = F) | ||
nodelabels(text = 1 : (Ntip(tree) + Nnode(tree)), node = 1 : (Ntip(tree) + Nnode(tree)), frame = "circle") | ||
edgelabels(frame = "none", col = "red", adj = c(.5, -.1)) | ||
axisPhylo(1) | ||
|
||
return(tree) | ||
} | ||
|
||
# Use this function to get the data on the number of samples per species | ||
getIndividuals <- function() | ||
{ | ||
fileName <- paste("./data/", sep = "", readline(prompt = "Enter the file in which the number of samples per species data is stored, and include the file extension: ")) | ||
num.indivs <- scan(fileName, sep = " ") | ||
return(num.indivs) | ||
} | ||
|
||
# Use this function to get the per gene expression data | ||
getExprData <- function(num.indivs) | ||
{ | ||
fileName <- paste("./data/", sep = "", readline(prompt = "Enter the file in which the gene expression data is stored, and include the file extension: ")) | ||
gene.data <- as.matrix(read.table("data/sampleExpr.dat",skip = 1,header = F,row.names = 1)) | ||
|
||
colnames(gene.data) <- rep(LETTERS[1:5], num.indivs) | ||
|
||
return(gene.data) | ||
} |