Skip to content

Commit

Permalink
Merge pull request #1 from Jmendo12/dev
Browse files Browse the repository at this point in the history
Finished a working diversity divergence test.
  • Loading branch information
Jmendo12 authored Mar 4, 2019
2 parents ae33128 + 8c55557 commit 98784cf
Show file tree
Hide file tree
Showing 10 changed files with 340 additions and 558 deletions.
Binary file modified .RData
Binary file not shown.
583 changes: 83 additions & 500 deletions .Rhistory

Large diffs are not rendered by default.

Binary file modified results/dvdtresults.RData
Binary file not shown.
Binary file added results/dvdtsbresults.RData
Binary file not shown.
Binary file added results/llindivbetaresults.RData
Binary file not shown.
Binary file added results/llsharedbetaresults.RData
Binary file not shown.
54 changes: 54 additions & 0 deletions scripts/EVE-model-main.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# This is the main file in which to conduct tests, and calculate the resulting likelihood ratios and chi sqaured values.
# Author: Rori Rohlfs, Lars Gronvold, John Mendoza
# Date: 3/2/2019

source('./scripts/eve-io.R')
source('./scripts/dvdt.R')
source('./scripts/dvdtsb.R')

divergenceDiversityTest <- function()
{
# Initialize the tree
tree <- initializePhylogeny()

# Initialize the number of samples per species
num.indivs <- getIndividuals()

# Initialize the gene data
gene.data <- getExprData(num.indivs)

# Calculate the total likelihoods for the null and alternative hypothesis
ll.IndivBeta <- calculateLLIndivBeta(tree, num.indivs, gene.data)
ll.SharedBeta <- calculateLLsharedBeta(tree, num.indivs, gene.data)

# Calculate the likelihood ratios - the if statements are to prevent NaN values from being returned from the natural
# log calculations; these NaNs are returned if the argument passed in to log is negative
if( ll.IndivBeta < 0 )
{
lr <- -2 * log(-ll.IndivBeta) - (-2 * log(ll.SharedBeta))
}
else if(ll.SharedBeta < 0)
{
lr < - 2 * log(ll.IndivBeta) - (-2 * log(-ll.SharedBeta))
}
else
{
lr <- -2 * log(ll.IndivBeta) - (-2 * log(ll.SharedBeta))
}

# If the likelihood ratio is negative, make it positive so that caculating log(lr) does not return NaN
if(lr < 0)
{
lr <- -lr
}

# Calculate chi sqaured with one degree of freedom
chi.squared <- 2 * log(lr)

# Since R does not support returning multiple values from a function, save both the likelihood ration value and chi sqaured
# value to a results file. To view these results in your enviromnent call load("./results/dvdtresults.RData")
save(lr , chi.squared, file = "./results/dvdtresults.RData")

# Return the chi squared value
return(chi.squared)
}
79 changes: 21 additions & 58 deletions EVEmodel.R → scripts/dvdt.R
Original file line number Diff line number Diff line change
@@ -1,44 +1,13 @@
# This file contains functions to calculate gene likelihoods assuming an individual beta value between genes.
# To run the calculation as a whole execute the function calculateLLIndivBeta. The total likelihood result
# will be returned from the function. The full results will be stored to an .RData file; to view these results
# call load("./results/llindivbetaresults.RData").
# Author: Rori Rohlfs, Lars Gronvold, John Mendoza
# Date 2/25/19

#Include all our necessary libraries
library(ape)
library(mvtnorm)

## Phylogengy Implementation
# Initiliaze the phylogeny data and plot the phylogeny
initializePhylogeny <- function()
{
# Read in the filename the phylogeny is stored in
fileName <- paste("./data/", sep = "", readline(prompt = "Enter the file in which the phylogeny data is stored, and include the file extension: "))
# Create a tree from the file given by the user
tree <- read.tree(fileName)
# Reset the margins
par(mar = c(5, 4, 4, 2) + 0.1)
# Plot the tree so the edge, node numbers, and tip labels are shown
plot.phylo(tree, type = "p", show.tip.label = T, label.offset = .05, y.lim = c(.5, 5.5), no.margin = F)
nodelabels(text = 1 : (Ntip(tree) + Nnode(tree)), node = 1 : (Ntip(tree) + Nnode(tree)), frame = "circle")
edgelabels(frame = "none", col = "red", adj = c(.5, -.1))
axisPhylo(1)

return(tree)
}

# Use this function to get the data on the number of samples per species
getIndividuals <- function()
{
fileName <- paste("./data/", sep = "", readline(prompt = "Enter the file in which the number of samples per species data is stored, and include the file extension: "))
num.indivs <- scan(fileName, sep = " ")
return(num.indivs)
}

# Use this function to get the per gene expression data
getExprData <- function(num.indivs)
{
fileName <- paste("./data/", sep = "", readline(prompt = "Enter the file in which the gene expression data is stored, and include the file extension: "))
gene.data <- as.matrix(read.table("data/sampleExpr.dat",skip = 1,header = F,row.names = 1))

colnames(gene.data) <- rep(LETTERS[1:5], num.indivs)

return(gene.data)
}
source('./scripts/eve-io.R')

# Parameter implementation
# Use this function to get the per gene parameter matrix
Expand Down Expand Up @@ -158,7 +127,7 @@ expandECovMatrix <- function(expected.mean, covar.matrix, sigma.sqaured, alpha,

logLikOU <- function(param.matrix.row, tree, gene.data.row, num.indivs)
{
# Create vectors of paramters to pass into the function for calculating expression variance
# Create varaibles with paramters to pass into the function for calculating expression variance
theta <- param.matrix.row[1]
sigma.squared <- param.matrix.row[2]
alpha <- param.matrix.row[3]
Expand Down Expand Up @@ -191,38 +160,32 @@ calculateTotalLL <- function(ll.pergene)
return(llTotal)
}

divergence.diversity.test <- function()
# Test for divergence and diversity between genes within species assuming an individual beta value for each gene
calculateLLIndivBeta <- function(tree, num.indivs, gene.data)
{
#Initialize the tree
tree <- initializePhylogeny()

#Initialize the number of samples per species
num.indivs <- getIndividuals()

#Intialize the per gene data
gene.data <- getExprData(num.indivs)

#Calculate the per gene parameter matrix based on the gene data
param.matrix <- calculateParams(gene.data, num.indivs)

# Create a vector to hold the values of the likelihood per gene and a list to hold the return values from the call to optim
ll.pergene <- vector(mode = "numeric", length = nrow(param.matrix))
ll.pergene.IndivBeta <- vector(mode = "numeric", length = nrow(param.matrix))
max.params <- list()

# For each gene, optimize the parameters and store the resulting likelihood in the likelihood vector
for(row in 1:length(ll.pergene))
for(row in 1:length(ll.pergene.IndivBeta))
{
max.params <- optim(param.matrix[row, ], fn = calculateLLPerGene, gr = NULL, tree, gene.data[row, ], num.indivs,
method = "L-BFGS-B", lower = c(.001, .001))
ll.pergene[row] <- as.numeric(max.params[2])
ll.pergene.IndivBeta[row] <- as.numeric(max.params[2])
}

# Calculate the total likelihood as the product of all values within the likelihood vector
ll.total <- calculateTotalLL(ll.pergene)
ll.total.IndivBeta <- calculateTotalLL(ll.pergene.IndivBeta)

# Save the results to a file that can be loaded into any future R environment for future use
# To load this data simply call load("./results/dvdtresults.RData")
save(ll.pergene, ll.total, file = "./results/dvdtresults.RData")
# To load this data simply call load("./results/llindivbetaresults.RData")
save(ll.pergene.IndivBeta, ll.total.IndivBeta, file = "./results/llindivbetaresults.RData")

return(ll.total)
}
return(ll.total.IndivBeta)
}


137 changes: 137 additions & 0 deletions scripts/dvdtsb.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# This file contains functions to calculate gene likelihoods assuming a shared beta value between genes.
# To run the calculation as a whole execute the function calculateLLsharedBeta. The total likelihood result
# will be returned from the function. The full results will be stored to an .RData file; to view these results
# call load("./results/llsharedbetaresults.RData").
# Author: Rori Rohlfs, Lars Gronvold, John Mendoza
# Date 2/25/19

#Include all our necessary libraries
library(mvtnorm)
source('./scripts/eve-io.R')
source('./scripts/dvdt.R')

# This function calculates the initial parameters for the test where the beta paramater is assumed to be shared between genes
calculateParamsSharedBeta <- function(gene.data, num.indivs)
{
# Create the paramater matrix with the same number of rows as genes and one column for each parameter, and then name cols
param.matrix <- matrix(nrow = nrow(gene.data), ncol = 4)
colnames(param.matrix) <- c("theta", "sigma2", "alpha", "betaShared")

# Create a vector in which to store the mean value of data per species per row
species.mean <- vector(mode = "numeric", length = length(num.indivs))
# Create a matrix to store the variance within species for each species and each gene
species.var <- matrix(nrow = nrow(gene.data), ncol = length(num.indivs))
# Create a vector to store the mean of the variance within species for each gene
mean.species.var <- vector(mode = "numeric", length = nrow(species.var))

alpha <- .5

# For each row within the paramater matrix, fill each column with parameter values
for(row in 1:nrow(param.matrix))
{
# These start and end indices define where one species begins and ends for a gene
start <- 1
end <- num.indivs[1]

for(i in 1:length(num.indivs))
{
# This increments the end point for a species for a gene
if(i != 1)
{
end <- end + num.indivs[i]
}
# Calculate the mean of expression for each species and the variance of expression for each species
species.mean[i] <- mean(gene.data[row, start:end])
species.var[row, i] <- var(gene.data[row, start:end])
# This increments the beginning point of a species for a gene
start <- start + num.indivs[i]
}
# Fill the paramter matrix with theta, sigma sqaured, and alpha values. Compute the mean variance for each gene and store it
param.matrix[row, 1] <- mean(species.mean)
param.matrix[row, 2] <- var(species.mean)
param.matrix[row, 3] <- alpha
mean.species.var[row] <- mean(species.var[row,])

}

# Compute and store the shared beta value as the mean of mean variance for each species for each gene divided by the mean of
# sigma values
param.matrix[,4] <- mean(mean.species.var) / mean(param.matrix[,2])
return(param.matrix)
}

make.LogLikOU <- function(tree, gene.data.row, num.indivs, fixed = c(FALSE, FALSE, FALSE, FALSE))
{
params <- fixed

function(p)
{
# Use a boolean vector to determine which paramter to fix for the optimization
params[!fixed] <- p[!fixed]

# Create variables with paramters to pass into the function for calculating expression variance
theta <- params[1]
sigma.squared <- params[2]
alpha <- params[3]
beta <- params[4]

# Define the expected species mean and evolutionary variance for the root
expected.species.mean.root <- theta
evol.var.root <- sigma.squared / (2 * alpha)

expression.var <- calcExpVarOU(tree, theta, alpha, sigma.squared, evol.var.root, expected.species.mean.root)
covar.matrix <- calcCovMatOU(tree, alpha, expression.var$evol.variance)
expanded.matrix <- expandECovMatrix(expression.var$expected.mean, covar.matrix, sigma.squared, alpha, beta, num.indivs)

# Get log likelihood from the multivariate normal distribution density
ll <- dmvnorm(x = gene.data.row, mean = expanded.matrix$expected.mean, sigma = expanded.matrix$cov.matr, log = TRUE)
return(-ll)
}
}

# Test for divergence and diversity between genes within species assuming a shared beta value between genes
calculateLLsharedBeta <- function(tree, num.indivs, gene.data)
{
# Calculate the inital per gene parameter matrix based on the gene data; note that in this version of the function, beta is not
# calculated as we are under the assumption that beta is shared between genes
param.matrix <- calculateParamsSharedBeta(gene.data, num.indivs)

# Store the beta value in an independent variable and remove the beta column from the paramter matrix
#beta <- param.matrix[1,4]
#param.matrix <- param.matrix[, -4]

# Create a vector to hold the values of the likelihood per gene and a list to hold the return values from the call to optim
ll.pergene.sharedBeta <- vector(mode = "numeric", length = nrow(param.matrix))
max.params <- list()

# First, create an environment which contains the function to be optimized, and can hold all values other than beta fixed
# during the optimization
llOU <- make.LogLikOU(tree, gene.data[1, ], num.indivs,
c(param.matrix[1, 1], param.matrix[1, 2], param.matrix[1, 3], FALSE))
# Next, optimize the beta value for the first row of the initial paramters
max.params <- optim(param.matrix[1, ], fn = llOU, gr = NULL, method = "L-BFGS-B", lower = c(.001, .001))
# Store the result in a numerical vector, and then set the beta value for all genes as this result
vec <- as.numeric(max.params[[1]])
param.matrix[, 4] <- vec[4]

# For each gene, optimize the parameters keeping the shared beta fixed, and store the resulting likelihood in the likelihood vector
for(row in 1:length(ll.pergene.sharedBeta))
{
# Now, create an enviroment with the function to be optimized, and hold the shared beta value fixed
llOU <- make.LogLikOU(tree, gene.data[row, ], num.indivs,
c(FALSE, FALSE, FALSE, param.matrix[row, 4]))
# Store the results of the optimization in a list
max.params <- optim(param.matrix[row, ], fn = llOU, gr = NULL, method = "L-BFGS-B", lower = c(.001, .001))
# Store the likelihood calculation based on the optimized parameter in a vector
ll.pergene.sharedBeta[row] <- as.numeric(max.params[2])
}

# Calculate the total likelihood as the product of all values within the likelihood vector
ll.total.sharedBeta <- calculateTotalLL(ll.pergene.sharedBeta)

# Save the results to a file that can be loaded into any future R environment for future use
# To load this data simply call load("./results/llsharedbetaresults.RData")
save(ll.pergene.sharedBeta, ll.total.sharedBeta, file = "./results/llsharedbetaresults.RData")

return(ll.total.sharedBeta)
}
45 changes: 45 additions & 0 deletions scripts/eve-io.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# This file contains functions to read input from data files for the EVE-model tests. To include these functions in other files
# code source('eve-io.R') into the file you'd like to use these methods in.
# Author: Rori Rohlfs, Lars Gronvold, John Mendoza
# Data 2/25/19

#Include all our necessary libraries
library(ape)

## Phylogengy Implementation
# Initiliaze the phylogeny data and plot the phylogeny
initializePhylogeny <- function()
{
# Read in the filename the phylogeny is stored in
fileName <- paste("./data/", sep = "", readline(prompt = "Enter the file in which the phylogeny data is stored, and include the file extension: "))
# Create a tree from the file given by the user
tree <- read.tree(fileName)
# Reset the margins
par(mar = c(5, 4, 4, 2) + 0.1)
# Plot the tree so the edge, node numbers, and tip labels are shown
plot.phylo(tree, type = "p", show.tip.label = T, label.offset = .05, y.lim = c(.5, 5.5), no.margin = F)
nodelabels(text = 1 : (Ntip(tree) + Nnode(tree)), node = 1 : (Ntip(tree) + Nnode(tree)), frame = "circle")
edgelabels(frame = "none", col = "red", adj = c(.5, -.1))
axisPhylo(1)

return(tree)
}

# Use this function to get the data on the number of samples per species
getIndividuals <- function()
{
fileName <- paste("./data/", sep = "", readline(prompt = "Enter the file in which the number of samples per species data is stored, and include the file extension: "))
num.indivs <- scan(fileName, sep = " ")
return(num.indivs)
}

# Use this function to get the per gene expression data
getExprData <- function(num.indivs)
{
fileName <- paste("./data/", sep = "", readline(prompt = "Enter the file in which the gene expression data is stored, and include the file extension: "))
gene.data <- as.matrix(read.table("data/sampleExpr.dat",skip = 1,header = F,row.names = 1))

colnames(gene.data) <- rep(LETTERS[1:5], num.indivs)

return(gene.data)
}

0 comments on commit 98784cf

Please sign in to comment.