Initial commit

dgrtwo · Aug 22, 2014 · 4a2c7c2 · 4a2c7c2
commit 4a2c7c2
Show file tree

Hide file tree

Showing 22 changed files with 876 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,27 @@
+Package: GSEAMA
+Type: Package
+Title: Gene set enrichment analysis made awesome
+Version: 0.99.0
+Date: 2014-03-23
+Author: David Robinson
+Maintainer: David Robinson <dgrtwo@princeton.edu>
+Description: Gene set enrichment analysis methods based on a membership matrix
+Depends:
+    Matrix,
+    data.table,
+    ggplot2,
+    qvalue,
+    GO.db,
+    glmnet
+VignetteBuilder: knitr
+Imports:
+    Matrix,
+    data.table,
+    ggplot2,
+    GO.db,
+    AnnotationDbi
+Suggests:
+    knitr,
+    testthat
+URL: http://github.com/dgrtwo/GSEAMA
+License: MIT + file LICENSE
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,8 @@
+export(CompareTopColumns)
+export(CompareY)
+export(GOMembershipMatrix)
+export(TestAssociation)
+import(AnnotationDbi)
+import(Matrix)
+import(data.table)
+importFrom(GO.db,GOTERM)
diff --git a/R/AllClasses.R b/R/AllClasses.R
@@ -0,0 +1,16 @@
+##################################
+######## AllClasses.R
+########
+########    all classes in GSEAMA
+##################################
+####################################################################
+
+setClass("GeneMatrix",
+    representation(
+        matrix = "Matrix",
+        colData = "data.table",
+        geneData = "data.table",
+        fit = "ANY",
+        rankingMetric = "character"
+    )
+)
diff --git a/R/AllGenerics.R b/R/AllGenerics.R
diff --git a/R/CompareY.R b/R/CompareY.R
@@ -0,0 +1,71 @@
+#' Create a violin plot, boxplot or histogram looking at the metric of interest
+#' over one or more columns
+#' 
+#' This compares the metric of interest (y) across the given genes, as well as
+#' comparing it to the overall distribution. This shows a visual representation
+#' of how a column (a gene set, a motif, a transcription factor, etc) is associated
+#' with the metric.
+#' 
+#' @param m a GeneMatrix
+#' @param columns The ID of one or more columns to plot
+#' @param mode Type of plot to create: either "violin", "boxplot" or "histogram"
+#' @param charlimit Maximum number of characters to include in column names
+#' 
+#' @export
+CompareY = function(m, columns, mode="violin", charlimit=35) {
+    # graph one or more columns 
+    # reverse the list (so that it reads top down instead of bottom up)
+    columns = rev(columns)
+    coldata = m@colData[match(columns, ID), ]
+
+    y = rep(m@geneData$y, length(columns))
+    dat = data.frame(y=y, Present=c(as.matrix(m@matrix[, columns]) > 0))
+
+    # fix the terms so they are not too long
+    terms = strtrim(coldata$Term, charlimit)
+    terms[nchar(coldata$Term) > charlimit] = paste0(
+        terms[nchar(coldata$Term) > charlimit], "...")
+    dat$Set = rep(paste0(terms, " (", coldata$Count, ")"), each=NROW(m))
+    dat$Set = factor(dat$Set, levels=dat$Set[!duplicated(dat$Set)])
+
+    dat = dat[dat$Present, ]
+    dat = rbind(dat, data.frame(y=m@geneData$y, Present=TRUE, Set="Overall"))
+    dat$y = as.numeric(as.character(dat$y))
+
+    mode = match.arg(mode, c("violin", "boxplot", "histogram"))
+    if (mode == "histogram") {
+        g = ggplot(dat, aes(y)) + geom_histogram() + facet_wrap(~ Set, ncol=1, scale="free_y")
+    }
+    else {
+        g = (ggplot(dat, aes(x=Set, y=y)) + coord_flip() +
+             theme(axis.text.y=element_text(color="black", size=15)))
+        if (mode == "violin") {
+            g = g + geom_violin()
+        }
+        else {
+            g = g + geom_boxplot()
+        }
+    }
+    g
+}
+
+#' Create a graph comparing the top n columns (in terms of association with y)
+#' 
+#' This compares the metric of interest (y) across the 
+#'
+#' @param m GeneMatrix object
+#' @param n Number of genes to compare
+#' @param ... Additional arguments to be passed to CompareY
+#' 
+#' @details Most methods ("t.test", "wilcoxon", "hypergeometric") use the
+#' p-value to find the most significant columns. The "lasso" uses the order
+#' in which the terms were added to the regression
+#' 
+#' @export
+CompareTopColumns = function(m, n=9, ...) {
+    if (length(m@rankingMetric) == 0) {
+        stop("Cannot plot top genes without first running a test")
+    }
+    CompareY(m, m@colData[order(m@colData[[m@rankingMetric]]), ]$ID[1:n],
+             ...)
+}
diff --git a/R/GOMembershipMatrix.R b/R/GOMembershipMatrix.R
@@ -0,0 +1,100 @@
+#' Construct a new gene set membership matrix to use for performing enrichment
+#' analysis
+#' 
+#' Construct a gene set membership matrix (one row per gene, one column per
+#' gene set) from a GO annotation map.
+#' 
+#' @param annotations A Go3AnnDbBimap object, typically from a Bioconductor Annotation
+#' package: for example, org.Hs.egGO from org.Hs.eg.db
+#' @param ontology A vector of ontologies to include; should be a subset
+#' of c("BP", "MF", "CC")
+#' @param evidence A vector of GO evidence codes to include
+#' @param min.size Minimum size of a gene set to be included
+#' @param max.size Maximum size of a gene set to be included
+#' @param ancestors Whether a gene included in a gene set should also be included
+#' as being in all ancestors of that gene set
+#' 
+#' The GO annotation maps typically come from the Bioconductor AnnotationDB packages.
+#' A couple of notable examples are:
+#' 
+#' Homo sapiens: org.Hs.egGO
+#' S. cerevisiae: org.Sc.sgdGO
+#' E coli K12: org.EcK12.egGO
+#' 
+#' You can restrict the sets to the BP (Biological Process), MF (Molecular Function),
+#' or CC (Cellular Compartment) ontologies (by default all are included).
+#' 
+#' @import data.table Matrix AnnotationDbi
+#' @importFrom GO.db GOTERM
+#' 
+#' @examples
+#' 
+#' # yeast membership matrix
+#' library(org.Sc.sgd.db)
+#' mm = GOMembershipMatrix(org.Sc.sgdGO, min.size=5, max.size=250)
+#' 
+#' # restrict to Biological Process ontology:
+#' mm = GOMembershipMatrix(org.Sc.sgdGO, ontology="BP", min.size=5, max.size=250)
+#' 
+#' # human membership matrix
+#' library(org.Hs.eg.db)
+#' mm = GOMembershipMatrix(org.Sc.sgdGO, min.size=5, max.size=250)
+#' 
+#' @export
+GOMembershipMatrix = function(annotations, ontology=NULL, evidence=NULL,
+                            min.size=0, max.size=1e6, ancestors=TRUE) {
+    # create membership matrix
+    frame.dt = data.table(toTable(annotations))
+    if (is.null(ontology)) {
+        ontology = c("BP", "MF", "CC")
+    }
+    frame.dt = frame.dt[Ontology %in% ontology, ]
+
+    if (!is.null(evidence)) {
+        frame.dt = frame.dt[Evidence %in% evidence, ]
+    }
+
+    id.field = colnames(frame.dt)[1]
+    frame.dt = frame.dt[!duplicated(frame.dt[, list(get(id.field), go_id)]), ]
+
+    genes = sort(unique(frame.dt[[id.field]]))
+    sets = sort(unique(frame.dt$go_id))
+
+    m = Matrix(0, nrow=length(genes), ncol=length(sets),
+               dimnames=list(genes, sets))
+    m[cbind(match(frame.dt[[id.field]], genes),
+            match(frame.dt$go_id, sets))] = 1
+
+    if (ancestors) {
+        # include relationships recursively: if a gene is included in a set,
+        # also include it in that set's ancestors
+        offspring.tab = do.call(rbind, lapply(ontology, function(o) {
+            toTable(get(paste0("GO", o, "OFFSPRING")))
+        }))
+        offspring.indices = cbind(match(offspring.tab[, 1], sets),
+                                  match(offspring.tab[, 2], sets))
+        # remove those that don't match anything already in our set
+        offspring.indices = offspring.indices[complete.cases(offspring.indices), ]
+        offspring.m = Matrix(0, nrow=length(sets), ncol=length(sets),
+                             dimnames=list(sets, sets))
+        offspring.m[offspring.indices] = 1
+        # combine with descendants
+        m = m + m %*% offspring.m
+        # turn back into binary matrix
+        m = (m > 0) + 0
+    }
+
+    cs = colSums(m)
+    m = m[, cs >= min.size & cs <= max.size]
+
+    # create table of set data, matching the membership matrix
+    goterms = GOTERM[colnames(m)]
+    setData = data.table(ID=colnames(m), Term=Term(goterms),
+                         Definition=Definition(goterms), Count=colSums(m))
+
+    # create table of gene data
+    geneData = data.table(ID=rownames(m), Count=rowSums(m))
+
+    return(new("GeneMatrix", matrix=m, colData=setData,
+               geneData=geneData))
+}
diff --git a/R/GeneMatrix-methods.R b/R/GeneMatrix-methods.R
@@ -0,0 +1,55 @@
+# methods for working with a GeneMatrix
+
+setMethod("dim", "GeneMatrix", function(x) dim(x@matrix))
+setMethod("nrow", "GeneMatrix", function(x) nrow(x@matrix))
+setMethod("ncol", "GeneMatrix", function(x) ncol(x@matrix))
+
+setMethod("rownames", "GeneMatrix", function(x) rownames(x@matrix))
+setMethod("colnames", "GeneMatrix", function(x) colnames(x@matrix))
+
+setMethod("as.matrix", "GeneMatrix", function(x, ...) as.matrix(x@matrix))
+
+setMethod("print", signature(x="GeneMatrix"), function(x) {
+    cat(paste("Gene Matrix with", nrow(x), "genes and", ncol(x), "columns"))
+})
+
+setMethod("[", c("GeneMatrix", "ANY", "ANY", "ANY"),
+          function(x, i, j, ..., drop=TRUE)
+          {
+              if (!is.null(x@fit)) {
+                  warning("Subsetting a model that has already been tested")        
+              }
+
+              if (missing(i)) {
+                  i = 1:nrow(x@matrix)
+              }
+              if (missing(j)) {
+                  j = 1:ncol(x@matrix)
+              }
+
+              mat = x@matrix[i, j]
+              if (length(i) == 1) {
+                  mat = Matrix(mat, nrow=1)
+              }
+              else if (length(j) == 1) {
+                  mat = Matrix(mat, ncol=1)
+              }
+              x@matrix = mat
+
+              if (!is.character(i)) {
+                  x@geneData = x@geneData[i, ]
+              } else {
+                  x@geneData = x@geneData[match(i, ID), ]
+              }
+              if (!is.character(j)) {
+                  x@colData = x@colData[j, ]
+              } else {
+                  x@colData = x@colData[match(j, ID), ]
+              }    
+
+              x@geneData$Count = rowSums(x@matrix != 0)
+              x@colData$Count = colSums(x@matrix != 0)
+              stopifnot(all(rownames(x@matrix) == x@geneData$ID))
+              stopifnot(all(colnames(x@matrix) == x@colData$ID))
+              x
+          })