-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
dgrtwo
committed
Aug 22, 2014
0 parents
commit 4a2c7c2
Showing
22 changed files
with
876 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
Package: GSEAMA | ||
Type: Package | ||
Title: Gene set enrichment analysis made awesome | ||
Version: 0.99.0 | ||
Date: 2014-03-23 | ||
Author: David Robinson | ||
Maintainer: David Robinson <dgrtwo@princeton.edu> | ||
Description: Gene set enrichment analysis methods based on a membership matrix | ||
Depends: | ||
Matrix, | ||
data.table, | ||
ggplot2, | ||
qvalue, | ||
GO.db, | ||
glmnet | ||
VignetteBuilder: knitr | ||
Imports: | ||
Matrix, | ||
data.table, | ||
ggplot2, | ||
GO.db, | ||
AnnotationDbi | ||
Suggests: | ||
knitr, | ||
testthat | ||
URL: http://github.com/dgrtwo/GSEAMA | ||
License: MIT + file LICENSE |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
export(CompareTopColumns) | ||
export(CompareY) | ||
export(GOMembershipMatrix) | ||
export(TestAssociation) | ||
import(AnnotationDbi) | ||
import(Matrix) | ||
import(data.table) | ||
importFrom(GO.db,GOTERM) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
################################## | ||
######## AllClasses.R | ||
######## | ||
######## all classes in GSEAMA | ||
################################## | ||
#################################################################### | ||
|
||
setClass("GeneMatrix", | ||
representation( | ||
matrix = "Matrix", | ||
colData = "data.table", | ||
geneData = "data.table", | ||
fit = "ANY", | ||
rankingMetric = "character" | ||
) | ||
) |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
#' Create a violin plot, boxplot or histogram looking at the metric of interest | ||
#' over one or more columns | ||
#' | ||
#' This compares the metric of interest (y) across the given genes, as well as | ||
#' comparing it to the overall distribution. This shows a visual representation | ||
#' of how a column (a gene set, a motif, a transcription factor, etc) is associated | ||
#' with the metric. | ||
#' | ||
#' @param m a GeneMatrix | ||
#' @param columns The ID of one or more columns to plot | ||
#' @param mode Type of plot to create: either "violin", "boxplot" or "histogram" | ||
#' @param charlimit Maximum number of characters to include in column names | ||
#' | ||
#' @export | ||
CompareY = function(m, columns, mode="violin", charlimit=35) { | ||
# graph one or more columns | ||
# reverse the list (so that it reads top down instead of bottom up) | ||
columns = rev(columns) | ||
coldata = m@colData[match(columns, ID), ] | ||
|
||
y = rep(m@geneData$y, length(columns)) | ||
dat = data.frame(y=y, Present=c(as.matrix(m@matrix[, columns]) > 0)) | ||
|
||
# fix the terms so they are not too long | ||
terms = strtrim(coldata$Term, charlimit) | ||
terms[nchar(coldata$Term) > charlimit] = paste0( | ||
terms[nchar(coldata$Term) > charlimit], "...") | ||
dat$Set = rep(paste0(terms, " (", coldata$Count, ")"), each=NROW(m)) | ||
dat$Set = factor(dat$Set, levels=dat$Set[!duplicated(dat$Set)]) | ||
|
||
dat = dat[dat$Present, ] | ||
dat = rbind(dat, data.frame(y=m@geneData$y, Present=TRUE, Set="Overall")) | ||
dat$y = as.numeric(as.character(dat$y)) | ||
|
||
mode = match.arg(mode, c("violin", "boxplot", "histogram")) | ||
if (mode == "histogram") { | ||
g = ggplot(dat, aes(y)) + geom_histogram() + facet_wrap(~ Set, ncol=1, scale="free_y") | ||
} | ||
else { | ||
g = (ggplot(dat, aes(x=Set, y=y)) + coord_flip() + | ||
theme(axis.text.y=element_text(color="black", size=15))) | ||
if (mode == "violin") { | ||
g = g + geom_violin() | ||
} | ||
else { | ||
g = g + geom_boxplot() | ||
} | ||
} | ||
g | ||
} | ||
|
||
#' Create a graph comparing the top n columns (in terms of association with y) | ||
#' | ||
#' This compares the metric of interest (y) across the | ||
#' | ||
#' @param m GeneMatrix object | ||
#' @param n Number of genes to compare | ||
#' @param ... Additional arguments to be passed to CompareY | ||
#' | ||
#' @details Most methods ("t.test", "wilcoxon", "hypergeometric") use the | ||
#' p-value to find the most significant columns. The "lasso" uses the order | ||
#' in which the terms were added to the regression | ||
#' | ||
#' @export | ||
CompareTopColumns = function(m, n=9, ...) { | ||
if (length(m@rankingMetric) == 0) { | ||
stop("Cannot plot top genes without first running a test") | ||
} | ||
CompareY(m, m@colData[order(m@colData[[m@rankingMetric]]), ]$ID[1:n], | ||
...) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
#' Construct a new gene set membership matrix to use for performing enrichment | ||
#' analysis | ||
#' | ||
#' Construct a gene set membership matrix (one row per gene, one column per | ||
#' gene set) from a GO annotation map. | ||
#' | ||
#' @param annotations A Go3AnnDbBimap object, typically from a Bioconductor Annotation | ||
#' package: for example, org.Hs.egGO from org.Hs.eg.db | ||
#' @param ontology A vector of ontologies to include; should be a subset | ||
#' of c("BP", "MF", "CC") | ||
#' @param evidence A vector of GO evidence codes to include | ||
#' @param min.size Minimum size of a gene set to be included | ||
#' @param max.size Maximum size of a gene set to be included | ||
#' @param ancestors Whether a gene included in a gene set should also be included | ||
#' as being in all ancestors of that gene set | ||
#' | ||
#' The GO annotation maps typically come from the Bioconductor AnnotationDB packages. | ||
#' A couple of notable examples are: | ||
#' | ||
#' Homo sapiens: org.Hs.egGO | ||
#' S. cerevisiae: org.Sc.sgdGO | ||
#' E coli K12: org.EcK12.egGO | ||
#' | ||
#' You can restrict the sets to the BP (Biological Process), MF (Molecular Function), | ||
#' or CC (Cellular Compartment) ontologies (by default all are included). | ||
#' | ||
#' @import data.table Matrix AnnotationDbi | ||
#' @importFrom GO.db GOTERM | ||
#' | ||
#' @examples | ||
#' | ||
#' # yeast membership matrix | ||
#' library(org.Sc.sgd.db) | ||
#' mm = GOMembershipMatrix(org.Sc.sgdGO, min.size=5, max.size=250) | ||
#' | ||
#' # restrict to Biological Process ontology: | ||
#' mm = GOMembershipMatrix(org.Sc.sgdGO, ontology="BP", min.size=5, max.size=250) | ||
#' | ||
#' # human membership matrix | ||
#' library(org.Hs.eg.db) | ||
#' mm = GOMembershipMatrix(org.Sc.sgdGO, min.size=5, max.size=250) | ||
#' | ||
#' @export | ||
GOMembershipMatrix = function(annotations, ontology=NULL, evidence=NULL, | ||
min.size=0, max.size=1e6, ancestors=TRUE) { | ||
# create membership matrix | ||
frame.dt = data.table(toTable(annotations)) | ||
if (is.null(ontology)) { | ||
ontology = c("BP", "MF", "CC") | ||
} | ||
frame.dt = frame.dt[Ontology %in% ontology, ] | ||
|
||
if (!is.null(evidence)) { | ||
frame.dt = frame.dt[Evidence %in% evidence, ] | ||
} | ||
|
||
id.field = colnames(frame.dt)[1] | ||
frame.dt = frame.dt[!duplicated(frame.dt[, list(get(id.field), go_id)]), ] | ||
|
||
genes = sort(unique(frame.dt[[id.field]])) | ||
sets = sort(unique(frame.dt$go_id)) | ||
|
||
m = Matrix(0, nrow=length(genes), ncol=length(sets), | ||
dimnames=list(genes, sets)) | ||
m[cbind(match(frame.dt[[id.field]], genes), | ||
match(frame.dt$go_id, sets))] = 1 | ||
|
||
if (ancestors) { | ||
# include relationships recursively: if a gene is included in a set, | ||
# also include it in that set's ancestors | ||
offspring.tab = do.call(rbind, lapply(ontology, function(o) { | ||
toTable(get(paste0("GO", o, "OFFSPRING"))) | ||
})) | ||
offspring.indices = cbind(match(offspring.tab[, 1], sets), | ||
match(offspring.tab[, 2], sets)) | ||
# remove those that don't match anything already in our set | ||
offspring.indices = offspring.indices[complete.cases(offspring.indices), ] | ||
offspring.m = Matrix(0, nrow=length(sets), ncol=length(sets), | ||
dimnames=list(sets, sets)) | ||
offspring.m[offspring.indices] = 1 | ||
# combine with descendants | ||
m = m + m %*% offspring.m | ||
# turn back into binary matrix | ||
m = (m > 0) + 0 | ||
} | ||
|
||
cs = colSums(m) | ||
m = m[, cs >= min.size & cs <= max.size] | ||
|
||
# create table of set data, matching the membership matrix | ||
goterms = GOTERM[colnames(m)] | ||
setData = data.table(ID=colnames(m), Term=Term(goterms), | ||
Definition=Definition(goterms), Count=colSums(m)) | ||
|
||
# create table of gene data | ||
geneData = data.table(ID=rownames(m), Count=rowSums(m)) | ||
|
||
return(new("GeneMatrix", matrix=m, colData=setData, | ||
geneData=geneData)) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# methods for working with a GeneMatrix | ||
|
||
setMethod("dim", "GeneMatrix", function(x) dim(x@matrix)) | ||
setMethod("nrow", "GeneMatrix", function(x) nrow(x@matrix)) | ||
setMethod("ncol", "GeneMatrix", function(x) ncol(x@matrix)) | ||
|
||
setMethod("rownames", "GeneMatrix", function(x) rownames(x@matrix)) | ||
setMethod("colnames", "GeneMatrix", function(x) colnames(x@matrix)) | ||
|
||
setMethod("as.matrix", "GeneMatrix", function(x, ...) as.matrix(x@matrix)) | ||
|
||
setMethod("print", signature(x="GeneMatrix"), function(x) { | ||
cat(paste("Gene Matrix with", nrow(x), "genes and", ncol(x), "columns")) | ||
}) | ||
|
||
setMethod("[", c("GeneMatrix", "ANY", "ANY", "ANY"), | ||
function(x, i, j, ..., drop=TRUE) | ||
{ | ||
if (!is.null(x@fit)) { | ||
warning("Subsetting a model that has already been tested") | ||
} | ||
|
||
if (missing(i)) { | ||
i = 1:nrow(x@matrix) | ||
} | ||
if (missing(j)) { | ||
j = 1:ncol(x@matrix) | ||
} | ||
|
||
mat = x@matrix[i, j] | ||
if (length(i) == 1) { | ||
mat = Matrix(mat, nrow=1) | ||
} | ||
else if (length(j) == 1) { | ||
mat = Matrix(mat, ncol=1) | ||
} | ||
x@matrix = mat | ||
|
||
if (!is.character(i)) { | ||
x@geneData = x@geneData[i, ] | ||
} else { | ||
x@geneData = x@geneData[match(i, ID), ] | ||
} | ||
if (!is.character(j)) { | ||
x@colData = x@colData[j, ] | ||
} else { | ||
x@colData = x@colData[match(j, ID), ] | ||
} | ||
|
||
x@geneData$Count = rowSums(x@matrix != 0) | ||
x@colData$Count = colSums(x@matrix != 0) | ||
stopifnot(all(rownames(x@matrix) == x@geneData$ID)) | ||
stopifnot(all(colnames(x@matrix) == x@colData$ID)) | ||
x | ||
}) |
Oops, something went wrong.