diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 171c5b37..897cd5be 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,7 +30,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest] - r-version: [4.2] + r-version: [4.3] steps: diff --git a/.gitignore b/.gitignore index 974ca7d4..dca9e2e4 100644 --- a/.gitignore +++ b/.gitignore @@ -34,3 +34,5 @@ tests/testthat/*.vcf vignettes/*.npy docs/*.npy docs/reference/*.npy +bladder.rdata +skin.rdata diff --git a/DESCRIPTION b/DESCRIPTION index 636e0c37..3b3f5e9a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: netZooR Type: Package Title: Unified methods for the inference and analysis of gene regulatory networks -Version: 1.5.4 -Date: 2023-09-25 +Version: 1.5.17 +Date: 2024-02-29 Authors@R: c(person("Marouen", "Ben Guebila", email = "benguebila@hsph.harvard.edu", role = c("aut","cre"), comment = c(ORCID = "0000-0001-5934-966X")), person("Tian", "Wang", @@ -22,11 +22,11 @@ Authors@R: c(person("Marouen", "Ben Guebila", Maintainer: Marouen Ben Guebila Description: netZooR unifies the implementations of several Network Zoo methods (netzoo, netzoo.github.io) into a single package by creating interfaces between network inference and network analysis methods. Currently, the package has 3 methods for network inference including PANDA and its optimized implementation OTTER (network reconstruction using mutliple lines of biological evidence), LIONESS (single-sample network inference), and EGRET (genotype-specific networks). Network analysis methods include CONDOR (community detection), ALPACA (differential community detection), CRANE (significance estimation of differential modules), MONSTER (estimation of network transition states). In addition, YARN allows to process gene expresssion data for tissue-specific analyses and SAMBAR infers missing mutation data based on pathway information. Depends: R (>= 4.2.0), - igraph, + igraph, reticulate, - yarn, pandaR, - matrixcalc + matrixcalc, + Biobase Remotes: stan-dev/cmdstanr, jnpaulson/pandaR, @@ -75,7 +75,15 @@ Imports: GeneNet, loo, rARPACK, - corpcor + corpcor, + biomaRt, + downloader, + edgeR, + limma, + preprocessCore, + readr, + RColorBrewer, + quantro License: GPL-3 Encoding: UTF-8 LazyData: false diff --git a/NAMESPACE b/NAMESPACE index 2a9bd8c0..a6f2033b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,9 @@ export(adj2regulon) export(alpaca) export(alpacaCrane) export(alpacaExtractTopGenes) +export(annotateFromBiomart) +export(checkMisAnnotation) +export(checkTissuesToMerge) export(cobra) export(condorCluster) export(condorCoreEnrich) @@ -17,10 +20,15 @@ export(craneBipartite) export(craneUnipartite) export(createCondorObject) export(createPandaStyle) +export(downloadGTEx) export(dragon) export(el2adj) export(el2regulon) export(elistToAdjMat) +export(filterGenes) +export(filterLowGenes) +export(filterMissingGenes) +export(filterSamples) export(lioness) export(lionessPy) export(monster) @@ -36,11 +44,15 @@ export(monsterTransformationMatrix) export(monsterTransitionNetworkPlot) export(monsterTransitionPCAPlot) export(monsterdTFIPlot) +export(normalizeTissueAware) export(otter) export(pandaDiffEdges) export(pandaPy) export(pandaToAlpaca) export(pandaToCondorObject) +export(plotCMDS) +export(plotDensity) +export(plotHeatmap) export(priorPp) export(puma) export(runEgret) @@ -73,12 +85,33 @@ import(reticulate) import(stats, except= c(cov2cor,decompose,toeplitz,lowess,update,spectrum)) import(vegan, except=diversity) import(viridisLite) -import(yarn) +importClassesFrom(Biobase,ExpressionSet) +importClassesFrom(Biobase,eSet) +importFrom(Biobase,"assayData<-") +importFrom(Biobase,"fData<-") +importFrom(Biobase,"pData<-") +importFrom(Biobase,"phenoData<-") +importFrom(Biobase,"storageMode<-") +importFrom(Biobase,AnnotatedDataFrame) +importFrom(Biobase,ExpressionSet) importFrom(Biobase,assayData) +importFrom(Biobase,exprs) +importFrom(Biobase,fData) +importFrom(Biobase,featureNames) +importFrom(Biobase,pData) +importFrom(Biobase,storageMode) +importFrom(RColorBrewer,brewer.pal) importFrom(assertthat,assert_that) +importFrom(biomaRt,getBM) +importFrom(biomaRt,useMart) +importFrom(downloader,download) +importFrom(edgeR,cpm) +importFrom(gplots,heatmap.2) +importFrom(graphics,abline) importFrom(graphics,axis) importFrom(graphics,box) importFrom(graphics,hist) +importFrom(graphics,legend) importFrom(graphics,mtext) importFrom(graphics,par) importFrom(graphics,plot) @@ -90,6 +123,7 @@ importFrom(igraph,E) importFrom(igraph,V) importFrom(igraph,graph.data.frame) importFrom(igraph,plot.igraph) +importFrom(limma,normalizeQuantiles) importFrom(matrixStats,colSds) importFrom(matrixStats,rowSds) importFrom(methods,is) @@ -99,8 +133,18 @@ importFrom(parallel,mclapply) importFrom(penalized,optL1) importFrom(penalized,penalized) importFrom(penalized,predict) +importFrom(preprocessCore,normalize.quantiles) +importFrom(quantro,matdensity) +importFrom(readr,problems) +importFrom(readr,read_tsv) importFrom(reshape,melt.array) importFrom(reshape2,dcast) importFrom(reshape2,melt) +importFrom(stats,ave) +importFrom(stats,cmdscale) +importFrom(stats,dist) +importFrom(stats,model.matrix) +importFrom(stats,runmed) +importFrom(stats,sd) importFrom(tidyr,spread) importFrom(utils,write.table) diff --git a/R/ALPACA.R b/R/ALPACA.R index f0e51007..4c133de1 100644 --- a/R/ALPACA.R +++ b/R/ALPACA.R @@ -16,7 +16,6 @@ #' @importFrom utils write.table #' @rawNamespace import(GOstats, except= makeGOGraph) #' @import org.Hs.eg.db -#' @import yarn #' @export #' diff --git a/R/DRAGON.R b/R/DRAGON.R index aa84fa13..1cbc7743 100644 --- a/R/DRAGON.R +++ b/R/DRAGON.R @@ -5,6 +5,7 @@ # X_mean = np.mean(X_temp, axis=0) # return (X_temp - X_mean) / X_std + scale = function(x,bias=FALSE) { # sd does 1/(n-1), python does 1/n @@ -234,7 +235,7 @@ estimatePenaltyParameters = function(X1,X2) lower=c(0,0), upper=c(1,1), control = list(trace=TRUE,pgtol = 1e-15)) - + # reparameterize lambdas = c(1-res$par[1]^2, 1-res$par[2]^2) return(list("lambdas"=lambdas,"gammas"=res$par,"optim_result"=res,"risk_grid" = riskgrid)) @@ -296,12 +297,13 @@ get_precision_matrix_dragon = function(X1, X2, lambdas) # mu = np.mean(X, axis=0) } + get_partial_correlation_from_precision = function(Theta,selfEdges=FALSE) { # by default, does not return self edges (diagonal is set to zero) ggm = -cov2cor(Theta) if(!selfEdges) - ggm[diag(ggm)] = 0 + diag(ggm) = 0 return(ggm) } diff --git a/R/MONSTER.R b/R/MONSTER.R index 2efe83e5..5d5c5514 100644 --- a/R/MONSTER.R +++ b/R/MONSTER.R @@ -90,7 +90,8 @@ monsterPrintMonsterAnalysis <- function(x, ...){ #' to give to indirect compared to direct evidence. The default is 0.5 to give an #' equal weight to direct and indirect evidence. #' @param mode A parameter telling whether to build the regulatory networks ('buildNet') or to use provided regulatory networks -#' ('regNet'). If set to 'regNet', then the parameters motif, ni_method, ni.coefficient.cutoff, and alphaw will be set to NA. +#' ('regNet'). If set to 'regNet', then the parameters motif, ni_method, ni.coefficient.cutoff, and alphaw will be set to NA. Gene regulatory +#' networks are supplied in the 'expr' variable as a TF-by-Gene matrix, by concatenating the TF-by-Gene matrices of case and control, expr has size nTFs x 2nGenes. #' @export #' @import doParallel #' @import parallel diff --git a/R/PUMA.R b/R/PUMA.R index a30d4b99..6831027d 100644 --- a/R/PUMA.R +++ b/R/PUMA.R @@ -89,7 +89,11 @@ puma <- function(motif,expr=NULL,ppi=NULL,alpha=0.1,mir_file,hamming=0.001, expr <- expr[order(rownames(expr)),] }else if(mode=='union'){ gene.names=unique(union(rownames(expr),unique(motif[,2]))) - tf.names =unique(union(unique(ppi[,1]),unique(motif[,1]))) + if(is.null(ppi)){ + tf.names = unique(motif[,1]) + }else{ + tf.names =unique(union(unique(ppi[,1]),unique(motif[,1]))) + } num.TFs <- length(tf.names) num.genes <- length(gene.names) # gene expression matrix @@ -117,7 +121,11 @@ puma <- function(motif,expr=NULL,ppi=NULL,alpha=0.1,mir_file,hamming=0.001, regulatoryNetwork[Idx]=motif[,3] }else if(mode=='intersection'){ gene.names=unique(intersect(rownames(expr),unique(motif[,2]))) - tf.names =unique(intersect(unique(ppi[,1]),unique(motif[,1]))) + if(is.null(ppi)){ + tf.names =unique(motif[,1]) + }else{ + tf.names =unique(intersect(unique(ppi[,1]),unique(motif[,1]))) + } num.TFs <- length(tf.names) num.genes <- length(gene.names) # gene expression matrix @@ -241,6 +249,7 @@ puma <- function(motif,expr=NULL,ppi=NULL,alpha=0.1,mir_file,hamming=0.001, if(!is.null(mir_file)){ mirIndex = match(mir_file,tf.names) + mirIndex <- mirIndex[!is.na(mirIndex)] tfCoopNetwork[mirIndex,] = 0 tfCoopNetwork[,mirIndex] = 0 seqs = seq(1, num.TFs*num.TFs, num.TFs+1) diff --git a/R/YARN.R b/R/YARN.R new file mode 100644 index 00000000..84fae906 --- /dev/null +++ b/R/YARN.R @@ -0,0 +1,771 @@ +#' Annotate your Expression Set with biomaRt +#' +#' @param obj ExpressionSet object. +#' @param genes Genes or rownames of the ExpressionSet. +#' @param filters getBM filter value, see getBM help file. +#' @param attributes getBM attributes value, see getBM help file. +#' @param biomart BioMart database name you want to connect to. Possible database names can be retrieved with teh function listMarts. +#' @param dataset Dataset you want to use. To see the different datasets available within a biomaRt you can e.g. do: mart = useMart('ensembl'), followed by listDatasets(mart). +#' @param ... Values for useMart, see useMart help file. +#' +#' @return ExpressionSet object with a fuller featureData. +#' @export +#' +#' @importFrom biomaRt useMart +#' @importFrom biomaRt getBM +#' @importFrom Biobase featureNames +#' @importFrom Biobase fData +#' @importFrom Biobase fData<- +#' @importFrom Biobase ExpressionSet +#' @importClassesFrom Biobase ExpressionSet +#' +#' @examples +#' download.file('https://netzoo.s3.us-east-2.amazonaws.com/netZooR/unittest_datasets/yarn/bladder.rdata',destfile='netZooR/data/bladder.rdata') +#' download.file('https://netzoo.s3.us-east-2.amazonaws.com/netZooR/unittest_datasets/yarn/skin.rdata',destfile='netZooR/data/skin.rdata') +#' data(skin) +#' # subsetting and changing column name just for a silly example +#' skin <- skin[1:10,] +#' colnames(fData(skin)) = paste("names",1:6) +#' biomart<-"ENSEMBL_MART_ENSEMBL"; +#' genes <- sapply(strsplit(rownames(skin),split="\\."),function(i)i[1]) +#' newskin <-annotateFromBiomart(skin,genes=genes,biomart=biomart) +#' head(fData(newskin)[,7:11]) +#' +annotateFromBiomart <- function(obj,genes=featureNames(obj),filters="ensembl_gene_id", + attributes=c("ensembl_gene_id","hgnc_symbol","chromosome_name","start_position","end_position"), + biomart="ensembl",dataset="hsapiens_gene_ensembl",...){ + mart <- useMart(biomart=biomart,dataset=dataset,...) + anno <- getBM(attributes = attributes, filters = filters, + values = genes, mart = mart) + if(nrow(anno)length(genes)){ + warning(sprintf("getBM returned more rows than genes queried. Using first call of %s.",colnames(anno)[1])) + throw = which(duplicated(anno[,1])) + anno = anno[-throw,] + } + anno = anno[match(genes,anno[,"ensembl_gene_id"]),] + if(!is.null(fData(obj))) anno = cbind(fData(obj),anno) + fData(obj) = anno + obj +} + +#' Check for wrong annotation of a sample using classical MDS and control genes. +#' +#' @param obj ExpressionSet object. +#' @param phenotype phenotype column name in the phenoData slot to check. +#' @param controlGenes Name of controlGenes, ie. 'Y' chromosome. Can specify 'all'. +#' @param columnID Column name where controlGenes is defined in the featureData slot if other than 'all'. +#' @param plotFlag TRUE/FALSE Whether to plot or not +#' @param legendPosition Location for the legend. +#' @param ... Extra parameters for \code{\link{plotCMDS}} function. +#' +#' @importFrom graphics legend +#' +#' @return Plots a classical multi-dimensional scaling of the 'controlGenes'. Optionally returns co-ordinates. +#' @export +#' +#' @examples +#' data(bladder) +#' checkMisAnnotation(bladder,'GENDER',controlGenes='Y',legendPosition='topleft') +#' +checkMisAnnotation <- function(obj, phenotype, controlGenes = "all", + columnID = "chromosome_name", plotFlag = TRUE, legendPosition = NULL, + ...) { + if (tolower(controlGenes) != "all") { + obj <- filterGenes(obj, labels = controlGenes, featureName = columnID, + keepOnly = TRUE) + } + if (length(phenotype) == 1) { + phenotype <- factor(pData(obj)[, phenotype]) + } + res <- plotCMDS(obj, pch = 21, bg = phenotype, plotFlag = plotFlag, + ...) + if (!is.null(legendPosition)) + legend(legendPosition, legend = levels(phenotype), fill = 1:length(levels(phenotype))) + invisible(res) +} + +#' Check tissues to merge based on gene expression profile +#' +#' @param obj ExpressionSet object. +#' @param majorGroups Column name in the phenoData slot that describes the general body region or site of the sample. +#' @param minorGroups Column name in the phenoData slot that describes the specific body region or site of the sample. +#' @param filterFun Filter group specific genes that might disrupt PCoA analysis. +#' @param plotFlag TRUE/FALSE whether to plot or not +#' @param ... Parameters that can go to \code{\link[yarn]{checkMisAnnotation}} +#' +#' @return CMDS Plots of the majorGroupss colored by the minorGroupss. Optional matrix of CMDS loadings for each comparison. +#' @export +#' +#' @seealso checkTissuesToMerge +#' +#' @examples +#' data(skin) +#' checkTissuesToMerge(skin,'SMTS','SMTSD') +#' +checkTissuesToMerge <- function(obj, majorGroups, minorGroups, + filterFun = NULL, plotFlag = TRUE, ...) { + if (length(majorGroups) == 1) { + region <- factor(pData(obj)[, majorGroups]) + } else { + region <- factor(majorGroups) + } + if (!is.null(filterFun)) { + obj <- filterFun(obj) + } + result <- lapply(levels(region), function(i) { + keepSamples <- which(region == i) + objSubset <- obj[, keepSamples] + objSubset <- objSubset[which(rowSums(exprs(objSubset)) > 0), ] + res <- checkMisAnnotation(objSubset, phenotype = minorGroups, + controlGenes = "all", plotFlag = plotFlag, main = i, + ...) + res + }) + invisible(result) +} + +#' Download GTEx files and turn them into ExpressionSet object +#' +#' Downloads the V6 GTEx release and turns it into an ExpressionSet object. +#' +#' @param type Type of counts to download - default genes. +#' @param file File path and name to automatically save the downloaded GTEx expression set. Saves as a RDS file. +#' @param ... Does nothing currently. +#' +#' @return Organized ExpressionSet set. +#' @export +#' +#' @importFrom downloader download +#' @importFrom readr read_tsv +#' @importFrom readr problems +#' @importFrom Biobase AnnotatedDataFrame +#' @importFrom Biobase phenoData<- +#' @importFrom Biobase pData<- +#' +#' @examples +#' # obj <- downloadGTEx(type='genes',file='~/Desktop/gtex.rds') +downloadGTEx <- function(type = "genes", file = NULL, ...) { + phenoFile <- "http://www.gtexportal.org/static/datasets/gtex_analysis_v6/annotations/GTEx_Data_V6_Annotations_SampleAttributesDS.txt" + pheno2File <- "http://www.gtexportal.org/static/datasets/gtex_analysis_v6/annotations/GTEx_Data_V6_Annotations_SubjectPhenotypesDS.txt" + geneFile <- "http://www.gtexportal.org/static/datasets/gtex_analysis_v6/rna_seq_data/GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_reads.gct.gz" + + message("Downloading and reading files") + pdFile <- tempfile("phenodat", fileext = ".txt") + download(phenoFile, destfile = pdFile) + pd <- read_tsv(pdFile) + pd <- as.matrix(pd) + rownames(pd) <- pd[, "SAMPID"] + ids <- sapply(strsplit(pd[, "SAMPID"], "-"), function(i) paste(i[1:2], + collapse = "-")) + + pd2File <- tempfile("phenodat2", fileext = ".txt") + download(pheno2File, destfile = pd2File) + pd2 <- read_tsv(pd2File) + pd2 <- as.matrix(pd2) + rownames(pd2) <- pd2[, "SUBJID"] + pd2 <- pd2[which(rownames(pd2) %in% unique(ids)), ] + pd2 <- pd2[match(ids, rownames(pd2)), ] + rownames(pd2) <- colnames(counts) + + pdfinal <- AnnotatedDataFrame(data.frame(cbind(pd, pd2))) + + if (type == "genes") { + countsFile <- tempfile("counts", fileext = ".gz") + download(geneFile, destfile = countsFile) + cnts <- suppressWarnings(read_tsv(geneFile, skip = 2)) + genes <- unlist(cnts[, 1]) + geneNames <- unlist(cnts[, 2]) + counts <- cnts[, -c(1:2)] + counts <- as.matrix(counts) + rownames(counts) <- genes + for (i in 1:nrow(problems(cnts))) { + counts[problems(cnts)$row[i], problems(cnts)$col[i]] <- 1e+05 + } + throwAway <- which(rowSums(counts) == 0) + counts <- counts[-throwAway, ] + genes <- sub("\\..*", "", rownames(counts)) + + host <- "dec2013.archive.ensembl.org" + biomart <- "ENSEMBL_MART_ENSEMBL" + dataset <- "hsapiens_gene_ensembl" + attributes <- c("ensembl_gene_id", "hgnc_symbol", "chromosome_name", + "start_position", "end_position", "gene_biotype") + } + + message("Creating ExpressionSet") + pdfinal <- pdfinal[match(colnames(counts), rownames(pdfinal)), + ] + es <- ExpressionSet(as.matrix(counts)) + phenoData(es) <- pdfinal + pData(es)["GTEX-YF7O-2326-101833-SM-5CVN9", "SMTS"] <- "Skin" + pData(es)["GTEX-YEC3-1426-101806-SM-5PNXX", "SMTS"] <- "Stomach" + + message("Annotating from biomaRt") + es <- annotateFromBiomart(obj = es, genes = genes, host = host, + biomart = biomart, dataset = dataset, attributes = attributes) + + message("Cleaning up files") + unlink(pdFile) + unlink(pd2File) + unlink(countsFile) + + if (!is.null(file)) + saveRDS(es, file = file) + return(es) +} + +#' Extract the appropriate matrix +#' +#' This returns the raw counts, log2-transformed raw counts, or normalized expression. +#' If normalized = TRUE then the log paramater is ignored. +#' +#' @param obj ExpressionSet object or objrix. +#' @param normalized TRUE / FALSE, use the normalized matrix or raw counts +#' @param log TRUE/FALSE log2-transform. +#' +#' @importFrom Biobase assayData +#' @return matrix +#' @examples +#' +#' data(skin) +#' head(netZooR:::extractMatrix(skin,normalized=FALSE,log=TRUE)) +#' head(netZooR:::extractMatrix(skin,normalized=FALSE,log=FALSE)) +#' +extractMatrix <- function(obj, normalized = FALSE, log = TRUE) { + if (class(obj) == "ExpressionSet") { + if (!normalized) { + obj <- exprs(obj) + } else { + if (!"normalizedMatrix" %in% names(assayData(obj))) + stop("normalizedMatrix assayData missing") + obj <- assayData(obj)[["normalizedMatrix"]] + if (log & normalized) + message("normalizedMatrix is assumed to already be log-transformed") + log <- FALSE + } + } + if (log == TRUE) { + obj <- log2(obj + 1) + } + obj +} + +#' Filter specific genes +#' +#' The main use case for this function is the removal of sex-chromosome genes. +#' Alternatively, filter genes that are not protein-coding. +#' +#' @param obj ExpressionSet object. +#' @param labels Labels of genes to filter or keep, eg. X, Y, and MT +#' @param featureName FeatureData column name, eg. chr +#' @param keepOnly Filter or keep only the genes with those labels +#' +#' @return Filtered ExpressionSet object +#' @export +#' +#' @importFrom Biobase exprs +#' @importFrom Biobase fData +#' +#' @examples +#' data(skin) +#' filterGenes(skin,labels = c('X','Y','MT'),featureName='chromosome_name') +#' filterGenes(skin,labels = 'protein_coding',featureName='gene_biotype',keepOnly=TRUE) +#' +filterGenes <- function(obj, labels = c("X", "Y", "MT"), featureName = "chromosome_name", + keepOnly = FALSE) { + features <- fData(obj)[, featureName] + if (keepOnly == FALSE) { + throwAwayGenes <- which(features %in% labels) + } else { + throwAwayGenes <- which(!features %in% labels) + } + obj <- obj[-throwAwayGenes, ] + obj +} + +#' Filter genes that have less than a minimum threshold CPM for a given group/tissue +#' +#' @param obj ExpressionSet object. +#' @param groups Vector of labels for each sample or a column name of the phenoData slot. +#' for the ids to filter. Default is the column names. +#' @param threshold The minimum threshold for calling presence of a gene in a sample. +#' @param minSamples Minimum number of samples - defaults to half the minimum group size. +#' @param ... Options for \link[edgeR]{cpm}. +#' @seealso \link[edgeR]{cpm} function defined in the edgeR package. +#' +#' @return Filtered ExpressionSet object +#' @export +#' +#' @importFrom edgeR cpm +#' @importFrom Biobase exprs +#' @importFrom Biobase pData +#' +#' @examples +#' data(skin) +#' filterLowGenes(skin,'SMTSD') +#' +filterLowGenes <- function(obj, groups, threshold = 1, minSamples = NULL, + ...) { + if (is.null(minSamples)) { + if (length(groups) == 1) { + minSamples <- min(table(pData(obj)[, groups]))/2 + } else { + minSamples <- min(table(groups))/2 + } + } + counts <- cpm(exprs(obj), ...) + keep <- rowSums(counts > threshold) >= minSamples + obj <- obj[keep, ] + obj +} + +#' Filter genes not expressed in any sample +#' +#' The main use case for this function is the removal of missing genes. +#' +#' @param obj ExpressionSet object. +#' @param threshold Minimum sum of gene counts across samples -- defaults to zero. +#' +#' @return Filtered ExpressionSet object +#' @export +#' +#' @importFrom Biobase exprs +#' @importFrom Biobase fData +#' +#' @examples +#' data(skin) +#' filterMissingGenes(skin) +#' +filterMissingGenes <- function(obj, threshold = 0) { + sumGenes <- rowSums(exprs(obj)) + throwAwayGenes <- which(sumGenes <= threshold) + if (length(which(sumGenes <= 0)) > 0) { + obj <- obj[-throwAwayGenes, ] + } + obj +} + +#' Filter samples +#' +#' @param obj ExpressionSet object. +#' @param ids Names found within the groups labels corresponding to samples to be removed +#' @param groups Vector of labels for each sample or a column name of the phenoData slot +#' for the ids to filter. Default is the column names. +#' @param keepOnly Filter or keep only the samples with those labels. +#' +#' @return Filtered ExpressionSet object +#' @export +#' +#' @importFrom Biobase pData +#' +#' @examples +#' data(skin) +#' filterSamples(skin,ids = "Skin - Not Sun Exposed (Suprapubic)",groups="SMTSD") +#' filterSamples(skin,ids=c("GTEX-OHPL-0008-SM-4E3I9","GTEX-145MN-1526-SM-5SI9T")) +#' +filterSamples <- function(obj, ids, groups = colnames(obj), keepOnly = FALSE) { + if (length(groups) == 1) { + groups <- pData(obj)[, groups] + } + throwAway <- which(groups %in% ids) + if (keepOnly) { + obj <- obj[, throwAway] + } else { + obj <- obj[, -throwAway] + } + obj +} + +#' Normalize in a tissue aware context +#' +#' This function provides a wrapper to various normalization methods developed. +#' Currently it only wraps qsmooth and quantile normalization returning a log-transformed +#' normalized matrix. qsmooth is a normalization approach that normalizes samples in +#' a condition aware manner. +#' +#' @param obj ExpressionSet object +#' @param groups Vector of labels for each sample or a column name of the phenoData slot +#' for the ids to filter. Default is the column names +#' @param normalizationMethod Choice of 'qsmooth' or 'quantile' +#' @param ... Options for \code{\link{qsmooth}} function or \code{\link[limma]{normalizeQuantiles}} +#' +#' @return ExpressionSet object with an assayData called normalizedMatrix +#' @export +#' +#' @source The function qsmooth comes from the qsmooth packages +#' currently available on github under user 'kokrah'. +#' +#' @importFrom limma normalizeQuantiles +#' @importFrom Biobase storageMode +#' @importFrom Biobase storageMode<- +#' @importFrom Biobase assayData +#' @importFrom Biobase assayData<- +#' @importFrom preprocessCore normalize.quantiles +#' @importClassesFrom Biobase eSet +#' @importClassesFrom Biobase ExpressionSet +#' +#' @examples +#' data(skin) +#' normalizeTissueAware(skin,"SMTSD") +#' +normalizeTissueAware <- function(obj, groups, normalizationMethod = c("qsmooth", + "quantile"), ...) { + normalizationMethod <- match.arg(normalizationMethod) + if (length(groups) == 1) { + groups <- factor(pData(obj)[, groups]) + } + storageMode(obj) <- "environment" + if (normalizationMethod == "qsmooth") { + normalizedMatrix <- qsmooth(obj, groups = groups, + ...) + } else if (normalizationMethod == "quantile") { + if(length(unique(groups))>1){ + normalizedMatrix <- sapply(unique(groups), function(i) { + cnts <- exprs(obj[, which(pData(obj)$our %in% i)]) + nmat <- normalize.quantiles(cnts) + colnames(nmat) <- colnames(cnts) + nmat + }) + normalizedMatrix <- Reduce("cbind", normalizedMatrix) + normalizedMatrix <- normalizedMatrix[, match(colnames(obj), + colnames(normalizedMatrix))] + } else { + normalizedMatrix <- normalize.quantiles(exprs(obj)) + colnames(normalizedMatrix) <- colnames(obj) + } + } + assayData(obj)[["normalizedMatrix"]] <- normalizedMatrix + storageMode(obj) <- "lockedEnvironment" + obj +} + +#' Plot classical MDS of dataset +#' +#' This function plots the MDS coordinates for the "n" features of interest. Potentially uncovering batch +#' effects or feature relationships. +#' +#' @param obj ExpressionSet object or objrix. +#' @param comp Which components to display. +#' @param normalized TRUE / FALSE, use the normalized matrix or raw counts. +#' @param distFun Distance function, default is dist. +#' @param distMethod The distance measure to be used. This must be one of "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski". Any unambiguous substring can be given. +#' @param n Number of features to make use of in calculating your distances. +#' @param samples Perform on samples or genes. +#' @param log TRUE/FALSE log2-transform raw counts. +#' @param plotFlag TRUE/FALSE whether to plot or not. +#' @param ... Additional plot arguments. +#' @return coordinates +#' +#' @importFrom matrixStats rowSds +#' @importFrom stats dist +#' @importFrom stats cmdscale +#' @importFrom graphics plot +#' +#' @export +#' @examples +#' data(skin) +#' res <- plotCMDS(skin,pch=21,bg=factor(pData(skin)$SMTSD)) +#' \donttest{ +#' # library(calibrate) +#' # textxy(X=res[,1],Y=res[,2],labs=rownames(res)) +#' } +plotCMDS <- function(obj, comp = 1:2, normalized = FALSE, distFun = dist, + distMethod = "euclidian", n = NULL, samples = TRUE, log = TRUE, + plotFlag = TRUE, ...) { + if (is.null(n)) + n <- min(nrow(obj), 1000) + obj <- extractMatrix(obj, normalized, log) + genesToKeep <- which(rowSums(obj) > 0) + geneVars <- rowSds(obj[genesToKeep, ]) + geneIndices <- genesToKeep[order(geneVars, decreasing = TRUE)[seq_len(n)]] + obj <- obj[geneIndices, ] + + if (samples == TRUE) { + obj <- t(obj) + } + d <- distFun(obj, method = distMethod) + ord <- cmdscale(d, k = max(comp)) + xl <- paste("MDS component:", comp[1]) + yl <- paste("MDS component:", comp[2]) + + if (plotFlag == TRUE) + plot(ord[, comp], ylab = yl, xlab = xl, ...) + invisible(ord[, comp]) +} + +#' Density plots of columns in a matrix +#' +#' Plots the density of the columns of a matrix. Wrapper for \code{\link[quantro]{matdensity}}. +#' +#' @param obj ExpressionSet object +#' @param groups Vector of labels for each sample or a column name of the phenoData slot +#' for the ids to filter. Default is the column names. +#' @param normalized TRUE / FALSE, use the normalized matrix or log2-transformed raw counts +#' @param legendPos Legend title position. If null, does not create legend by default. +#' @param ... Extra parameters for \link[quantro]{matdensity}. +#' +#' @return A density plot for each column in the ExpressionSet object colored by groups +#' @export +#' +#' @importFrom quantro matdensity +#' @importFrom Biobase assayData +#' @importFrom Biobase storageMode +#' @importFrom graphics legend +#' +#' @examples +#' data(skin) +#' filtData <- filterLowGenes(skin,"SMTSD") +#' plotDensity(filtData,groups="SMTSD",legendPos="topleft") +#' # to remove the legend +#' plotDensity(filtData,groups="SMTSD") +#' +plotDensity <- function(obj, groups = NULL, normalized = FALSE, + legendPos = NULL, ...) { + if (length(groups) == 1) { + groups <- factor(pData(obj)[, groups]) + } + mat <- extractMatrix(obj, normalized, log = TRUE) + matdensity(mat, groupFactor = groups, ...) + if (!is.null(legendPos)) { + legend(legendPos, legend = levels(groups), fill = 1:length(levels(groups)), + box.col = NA) + } +} + +#' Plot heatmap of most variable genes +#' +#' This function plots a heatmap of the gene expressions forthe "n" features of interest. +#' +#' @param obj ExpressionSet object or objrix. +#' @param n Number of features to make use of in plotting heatmap. +#' @param fun Function to sort genes by, default \code{\link[stats]{sd}}. +#' @param normalized TRUE / FALSE, use the normalized matrix or raw counts. +#' @param log TRUE/FALSE log2-transform raw counts. +#' @param ... Additional plot arguments for \code{\link[gplots]{heatmap.2}}. +#' @return coordinates +#' +#' @importFrom gplots heatmap.2 +#' @importFrom RColorBrewer brewer.pal +#' @importFrom stats sd +#' +#' @export +#' @examples +#' data(skin) +#' tissues <- pData(skin)$SMTSD +#' plotHeatmap(skin,normalized=FALSE,log=TRUE,trace="none",n=10) +#' # Even prettier +#' \donttest{ +#' # library(RColorBrewer) +#' data(skin) +#' tissues <- pData(skin)$SMTSD +#' heatmapColColors <- brewer.pal(12,"Set3")[as.integer(factor(tissues))] +#' heatmapCols <- colorRampPalette(brewer.pal(9, "RdBu"))(50) +#' plotHeatmap(skin,normalized=FALSE,log=TRUE,trace="none",n=10, +#' col = heatmapCols,ColSideColors = heatmapColColors,cexRow = 0.6,cexCol = 0.6) +#'} +plotHeatmap <- function(obj, n = NULL, fun = stats::sd, normalized = TRUE, + log = TRUE, ...) { + if (is.null(n)) + n <- min(nrow(obj), 100) + mat <- extractMatrix(obj, normalized, log) + genesToKeep <- which(rowSums(mat) > 0) + geneStats <- apply(mat[genesToKeep, ], 1, fun) + geneIndices <- genesToKeep[order(geneStats, decreasing = TRUE)[seq_len(n)]] + mat <- mat[geneIndices, ] + heatmap.2(mat, ...) + invisible(mat) +} + +#' Quantile shrinkage normalization +#' +#' This function was modified from github user kokrah. +#' +#' @param obj for counts use log2(raw counts + 1)), for MA use log2(raw intensities) +#' @param groups groups to which samples belong (character vector) +#' @param norm.factors scaling normalization factors +#' @param plot plot weights? (default=FALSE) +#' @param window window size for running median (a fraction of the number of rows of exprs) +#' @param log Whether or not the data should be log transformed before normalization, TRUE = YES. +#' +#' @importFrom stats ave +#' @importFrom graphics par +#' @importFrom graphics abline +#' +#' @return Normalized expression +#' +#' @source \href{https://raw.githubusercontent.com/kokrah/qsmooth/master/R/qsmooth.r}{Kwame Okrah's qsmooth R package} +#' @examples +#' data(skin) +#' head(netZooR:::qsmooth(skin,groups=pData(skin)$SMTSD)) +#' +qsmooth <- function(obj, groups, norm.factors = NULL, plot = FALSE, + window = 0.05,log=TRUE) { + stopifnot(class(obj)=="ExpressionSet") + if(log==TRUE){ + exprs <- log2(exprs(obj)+1) + } else { + exprs <- exprs(obj) + } + # Stop if exprs contains any NA + if (any(is.na(exprs))) + stop("exprs contains NAs (K.Okrah)") + # Scale normalization step + if (is.null(norm.factors)) { + dat <- exprs + } else { + dat <- t(t(exprs) - norm.factors) + } + # Compute quantile stats + qs <- qstats(dat, groups, window = window) + Qref <- qs$Qref + Qhat <- qs$Qhat + w <- qs$smoothWeights + # Weighted quantiles + normExprs <- w * Qref + (1 - w) * Qhat + # Re-order normExprs by rank of exprs (columnwise) + for (i in 1:ncol(normExprs)) { + # Grab ref. i + ref <- normExprs[, i] + # Grab exprs column i + x <- exprs[, i] + # Grab ranks of x (using min rank for ties) + rmin <- rank(x, ties.method = "min") + # If x has rank ties then average the values of ref at those + # ranks + dups <- duplicated(rmin) + if (any(dups)) { + # Grab ranks of x (using random ranks for ties) (needed to + # uniquely identify the indices of tied ranks) + rrand <- rank(x, ties.method = "random") + # Grab tied ranks + tied.ranks <- unique(rmin[dups]) + for (k in tied.ranks) { + sel <- rrand[rmin == k] # Select the indices of tied ranks + ref[sel] <- ave(ref[sel]) + } + } + # Re-order ref and replace in normExprs + normExprs[, i] <- ref[rmin] + } + # Plot weights + if (plot) { + oldpar <- par(mar = c(4, 4, 1.5, 0.5)) + lq <- length(Qref) + u <- (1:lq - 0.5)/lq + if (length(u) > 10000) { + # do not plot more than 10000 points + sel <- sample(1:lq, 10000) + plot(u[sel], w[sel], pch = ".", main = "qsmooth weights", + xlab = " quantiles", ylab = "Weight", ylim = c(0, + 1)) + } else { + plot(u, w, pch = ".", main = "qsmooth weights", xlab = "quantiles", + ylab = "Weight", ylim = c(0, 1)) + } + abline(h = 0.5, v = 0.5, col = "red", lty = 2) + par(oldpar) + } + rownames(normExprs) <- rownames(exprs) + colnames(normExprs) <- colnames(exprs) + normExprs +} + +#' Compute quantile statistics +#' +#' This function was directly borrowed from github user kokrah. +#' +#' @param exprs for counts use log2(raw counts + 1)), for MA use log2(raw intensities) +#' @param groups groups to which samples belong (character vector) +#' @param window window size for running median as a fraction on the number of rows of exprs +#' +#' @importFrom stats runmed +#' @importFrom stats model.matrix +#' +#' @return list of statistics +#' +#' @source \href{https://raw.githubusercontent.com/kokrah/qsmooth/master/R/qstats.r}{Kwame Okrah's qsmooth R package} +#' Compute quantile statistics +#' +qstats <- function(exprs, groups, window) { + # Compute sample quantiles + Q <- apply(exprs, 2, sort) + # Compute quantile reference + Qref <- rowMeans(Q) + # Compute SST + SST <- rowSums((Q - Qref)^2) + # Compute SSB + f <- factor(as.character(groups)) + X <- model.matrix(~0 + f) + QBETAS <- t(solve(t(X) %*% X) %*% t(X) %*% t(Q)) + Qhat <- QBETAS %*% t(X) + SSB <- rowSums((Qhat - Qref)^2) + # Compute weights + roughWeights <- 1 - SSB/SST + roughWeights[SST < 1e-06] <- 1 + # Compute smooth weights + k <- floor(window * nrow(Q)) + if (k%%2 == 0) + k <- k + 1 + smoothWeights <- runmed(roughWeights, k = k, endrule = "constant") + list(Q = Q, Qref = Qref, Qhat = Qhat, QBETAS = QBETAS, SST = SST, + SSB = SSB, SSE = SST - SSB, roughWeights = roughWeights, + smoothWeights = smoothWeights) +} + +#' Skin RNA-seq data from the GTEx consortium +#' +#' Skin RNA-seq data from the GTEx consortium. V6 release. Random selection of 20 skin samples. +#' 13 of the samples are fibroblast cells, 5 Skin sun exposed, 2 sun unexposed. +#' +#' @docType data +#' +#' @usage data(skin) +#' +#' @format An object of class \code{"ExpressionSet"}; see \code{\link[Biobase]{ExpressionSet}}. +#' +#' @keywords datasets +#' +#' @return ExpressionSet object +#' +#' @references GTEx Consortium, 2015. The Genotype-Tissue Expression (GTEx) pilot analysis: Multitissue gene regulation in humans. Science, 348(6235), pp.648-660. +#' (\href{http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4547484/}{PubMed}) +#' +#' @source GTEx Portal +#' @name Skin_data +#' @examples +#' \donttest{data(skin); +#' checkMissAnnotation(skin,"GENDER");} +system('wget https://netzoo.s3.us-east-2.amazonaws.com/netZooR/unittest_datasets/yarn/skin.rdata') +system('mv skin.rdata data/') +"skin" + + + +#' Bladder RNA-seq data from the GTEx consortium +#' +#' Bladder RNA-seq data from the GTEx consortium. V6 release. +#' +#' @docType data +#' +#' @usage data(bladder) +#' +#' @format An object of class \code{"ExpressionSet"}; see \code{\link[Biobase]{ExpressionSet}}. +#' +#' @keywords datasets +#' +#' @references GTEx Consortium, 2015. The Genotype-Tissue Expression (GTEx) pilot analysis: Multitissue gene regulation in humans. Science, 348(6235), pp.648-660. +#' (\href{http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4547484/}{PubMed}) +#' +#' @source GTEx Portal +#' +#' @return ExpressionSet object +#' @name Bladder_data +#' @examples +#' \donttest{data(bladder); +#' checkMissAnnotation(bladder);} +system('wget https://netzoo.s3.us-east-2.amazonaws.com/netZooR/unittest_datasets/yarn/bladder.rdata') +system('mv bladder.rdata data/') +"bladder" diff --git a/README.md b/README.md index 2b7dec09..166e6e03 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ [![discussions](https://img.shields.io/badge/netZooR-discussions-orange)](https://github.com/netZoo/netZooR/discussions) [![DOI](https://zenodo.org/badge/190646802.svg)](https://zenodo.org/badge/latestdoi/190646802) -netZooR is tested on: (OS: Ubuntu + Macos) X (Language: R v4.2) +netZooR is tested on: (OS: Ubuntu + Macos) X (Language: R v4.3) ## Description netZooR is an R package to reconstruct, analyse, and plot biological networks. @@ -186,7 +186,7 @@ To report a bug, please open a new [issue](https://github.com/netZoo/netZooR/iss ## Tutorials For more details please refer to the [documentation website](https://netzoo.github.io/netZooR/). Tutorials are available at the top navigation bar **Articles/** for basic usage and application cases. -Or use `browseVignettes("netZooR")` after installing the package. Also check [netbooks](http://netbooks.networkmedicine.org) to go through the tutorials on a Jupyter notebook cloud server. +Or use `browseVignettes("netZooR")` after installing the package. [Netbooks](http://netbooks.networkmedicine.org) deploys tutorials on a Jupyter notebook cloud server to get you running without any installation. ## Contribution and Development Contributions are welcome! The contribution guide to netZooR can be found [here](https://netzoo.github.io/contribute/contribute/). diff --git a/man/Bladder_data.Rd b/man/Bladder_data.Rd new file mode 100644 index 00000000..ad182922 --- /dev/null +++ b/man/Bladder_data.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\docType{data} +\name{Bladder_data} +\alias{Bladder_data} +\title{Bladder RNA-seq data from the GTEx consortium} +\format{ +An object of class \code{"ExpressionSet"}; see \code{\link[Biobase]{ExpressionSet}}. +} +\source{ +GTEx Portal +} +\usage{ +data(bladder) +} +\value{ +ExpressionSet object +} +\description{ +Bladder RNA-seq data from the GTEx consortium. V6 release. +} +\examples{ +\donttest{data(bladder); +checkMissAnnotation(bladder);} +} +\references{ +GTEx Consortium, 2015. The Genotype-Tissue Expression (GTEx) pilot analysis: Multitissue gene regulation in humans. Science, 348(6235), pp.648-660. +(\href{http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4547484/}{PubMed}) +} +\keyword{datasets} diff --git a/man/Skin_data.Rd b/man/Skin_data.Rd new file mode 100644 index 00000000..41cd0e30 --- /dev/null +++ b/man/Skin_data.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\docType{data} +\name{Skin_data} +\alias{Skin_data} +\title{Skin RNA-seq data from the GTEx consortium} +\format{ +An object of class \code{"ExpressionSet"}; see \code{\link[Biobase]{ExpressionSet}}. +} +\source{ +GTEx Portal +} +\usage{ +data(skin) +} +\value{ +ExpressionSet object +} +\description{ +Skin RNA-seq data from the GTEx consortium. V6 release. Random selection of 20 skin samples. +13 of the samples are fibroblast cells, 5 Skin sun exposed, 2 sun unexposed. +} +\examples{ +\donttest{data(skin); +checkMissAnnotation(skin,"GENDER");} +} +\references{ +GTEx Consortium, 2015. The Genotype-Tissue Expression (GTEx) pilot analysis: Multitissue gene regulation in humans. Science, 348(6235), pp.648-660. +(\href{http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4547484/}{PubMed}) +} +\keyword{datasets} diff --git a/man/annotateFromBiomart.Rd b/man/annotateFromBiomart.Rd new file mode 100644 index 00000000..c2f2e089 --- /dev/null +++ b/man/annotateFromBiomart.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{annotateFromBiomart} +\alias{annotateFromBiomart} +\title{Annotate your Expression Set with biomaRt} +\usage{ +annotateFromBiomart( + obj, + genes = featureNames(obj), + filters = "ensembl_gene_id", + attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name", "start_position", + "end_position"), + biomart = "ensembl", + dataset = "hsapiens_gene_ensembl", + ... +) +} +\arguments{ +\item{obj}{ExpressionSet object.} + +\item{genes}{Genes or rownames of the ExpressionSet.} + +\item{filters}{getBM filter value, see getBM help file.} + +\item{attributes}{getBM attributes value, see getBM help file.} + +\item{biomart}{BioMart database name you want to connect to. Possible database names can be retrieved with teh function listMarts.} + +\item{dataset}{Dataset you want to use. To see the different datasets available within a biomaRt you can e.g. do: mart = useMart('ensembl'), followed by listDatasets(mart).} + +\item{...}{Values for useMart, see useMart help file.} +} +\value{ +ExpressionSet object with a fuller featureData. +} +\description{ +Annotate your Expression Set with biomaRt +} +\examples{ +download.file('https://netzoo.s3.us-east-2.amazonaws.com/netZooR/unittest_datasets/yarn/bladder.rdata',destfile='netZooR/data/bladder.rdata') +download.file('https://netzoo.s3.us-east-2.amazonaws.com/netZooR/unittest_datasets/yarn/skin.rdata',destfile='netZooR/data/skin.rdata') +data(skin) +# subsetting and changing column name just for a silly example +skin <- skin[1:10,] +colnames(fData(skin)) = paste("names",1:6) +biomart<-"ENSEMBL_MART_ENSEMBL"; +genes <- sapply(strsplit(rownames(skin),split="\\\\."),function(i)i[1]) +newskin <-annotateFromBiomart(skin,genes=genes,biomart=biomart) +head(fData(newskin)[,7:11]) + +} diff --git a/man/checkMisAnnotation.Rd b/man/checkMisAnnotation.Rd new file mode 100644 index 00000000..0cafdff9 --- /dev/null +++ b/man/checkMisAnnotation.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{checkMisAnnotation} +\alias{checkMisAnnotation} +\title{Check for wrong annotation of a sample using classical MDS and control genes.} +\usage{ +checkMisAnnotation( + obj, + phenotype, + controlGenes = "all", + columnID = "chromosome_name", + plotFlag = TRUE, + legendPosition = NULL, + ... +) +} +\arguments{ +\item{obj}{ExpressionSet object.} + +\item{phenotype}{phenotype column name in the phenoData slot to check.} + +\item{controlGenes}{Name of controlGenes, ie. 'Y' chromosome. Can specify 'all'.} + +\item{columnID}{Column name where controlGenes is defined in the featureData slot if other than 'all'.} + +\item{plotFlag}{TRUE/FALSE Whether to plot or not} + +\item{legendPosition}{Location for the legend.} + +\item{...}{Extra parameters for \code{\link{plotCMDS}} function.} +} +\value{ +Plots a classical multi-dimensional scaling of the 'controlGenes'. Optionally returns co-ordinates. +} +\description{ +Check for wrong annotation of a sample using classical MDS and control genes. +} +\examples{ +data(bladder) +checkMisAnnotation(bladder,'GENDER',controlGenes='Y',legendPosition='topleft') + +} diff --git a/man/checkTissuesToMerge.Rd b/man/checkTissuesToMerge.Rd new file mode 100644 index 00000000..9636b773 --- /dev/null +++ b/man/checkTissuesToMerge.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{checkTissuesToMerge} +\alias{checkTissuesToMerge} +\title{Check tissues to merge based on gene expression profile} +\usage{ +checkTissuesToMerge( + obj, + majorGroups, + minorGroups, + filterFun = NULL, + plotFlag = TRUE, + ... +) +} +\arguments{ +\item{obj}{ExpressionSet object.} + +\item{majorGroups}{Column name in the phenoData slot that describes the general body region or site of the sample.} + +\item{minorGroups}{Column name in the phenoData slot that describes the specific body region or site of the sample.} + +\item{filterFun}{Filter group specific genes that might disrupt PCoA analysis.} + +\item{plotFlag}{TRUE/FALSE whether to plot or not} + +\item{...}{Parameters that can go to \code{\link[yarn]{checkMisAnnotation}}} +} +\value{ +CMDS Plots of the majorGroupss colored by the minorGroupss. Optional matrix of CMDS loadings for each comparison. +} +\description{ +Check tissues to merge based on gene expression profile +} +\examples{ +data(skin) +checkTissuesToMerge(skin,'SMTS','SMTSD') + +} +\seealso{ +checkTissuesToMerge +} diff --git a/man/cobra.Rd b/man/cobra.Rd index d3952aba..b88fcf9a 100644 --- a/man/cobra.Rd +++ b/man/cobra.Rd @@ -4,14 +4,14 @@ \alias{cobra} \title{Run COBRA in R} \usage{ -cobra(X, expressionData, standardize = T) +cobra(X, expressionData, method = "pearson") } \arguments{ \item{X}{: design matrix of size (n, q), n = number of samples, q = number of covariates} \item{expressionData}{: gene expression as a matrix of size (g, n), g = number of genes} -\item{standardize}{: boolean flag to standardize the gene expression as a pre-processing step +\item{method}{: if pearson, the decomposition of the co-expression matrix is compouted. If pcorsh, COBRA decomposes the regularized partial co-expression Outputs:} } diff --git a/man/downloadGTEx.Rd b/man/downloadGTEx.Rd new file mode 100644 index 00000000..1b5dc95b --- /dev/null +++ b/man/downloadGTEx.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{downloadGTEx} +\alias{downloadGTEx} +\title{Download GTEx files and turn them into ExpressionSet object} +\usage{ +downloadGTEx(type = "genes", file = NULL, ...) +} +\arguments{ +\item{type}{Type of counts to download - default genes.} + +\item{file}{File path and name to automatically save the downloaded GTEx expression set. Saves as a RDS file.} + +\item{...}{Does nothing currently.} +} +\value{ +Organized ExpressionSet set. +} +\description{ +Downloads the V6 GTEx release and turns it into an ExpressionSet object. +} +\examples{ +# obj <- downloadGTEx(type='genes',file='~/Desktop/gtex.rds') +} diff --git a/man/extractMatrix.Rd b/man/extractMatrix.Rd new file mode 100644 index 00000000..26d422d8 --- /dev/null +++ b/man/extractMatrix.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{extractMatrix} +\alias{extractMatrix} +\title{Extract the appropriate matrix} +\usage{ +extractMatrix(obj, normalized = FALSE, log = TRUE) +} +\arguments{ +\item{obj}{ExpressionSet object or objrix.} + +\item{normalized}{TRUE / FALSE, use the normalized matrix or raw counts} + +\item{log}{TRUE/FALSE log2-transform.} +} +\value{ +matrix +} +\description{ +This returns the raw counts, log2-transformed raw counts, or normalized expression. +If normalized = TRUE then the log paramater is ignored. +} +\examples{ + +data(skin) +head(netZooR:::extractMatrix(skin,normalized=FALSE,log=TRUE)) +head(netZooR:::extractMatrix(skin,normalized=FALSE,log=FALSE)) + +} diff --git a/man/filterGenes.Rd b/man/filterGenes.Rd new file mode 100644 index 00000000..8343257e --- /dev/null +++ b/man/filterGenes.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{filterGenes} +\alias{filterGenes} +\title{Filter specific genes} +\usage{ +filterGenes( + obj, + labels = c("X", "Y", "MT"), + featureName = "chromosome_name", + keepOnly = FALSE +) +} +\arguments{ +\item{obj}{ExpressionSet object.} + +\item{labels}{Labels of genes to filter or keep, eg. X, Y, and MT} + +\item{featureName}{FeatureData column name, eg. chr} + +\item{keepOnly}{Filter or keep only the genes with those labels} +} +\value{ +Filtered ExpressionSet object +} +\description{ +The main use case for this function is the removal of sex-chromosome genes. +Alternatively, filter genes that are not protein-coding. +} +\examples{ +data(skin) +filterGenes(skin,labels = c('X','Y','MT'),featureName='chromosome_name') +filterGenes(skin,labels = 'protein_coding',featureName='gene_biotype',keepOnly=TRUE) + +} diff --git a/man/filterLowGenes.Rd b/man/filterLowGenes.Rd new file mode 100644 index 00000000..f3ac65e5 --- /dev/null +++ b/man/filterLowGenes.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{filterLowGenes} +\alias{filterLowGenes} +\title{Filter genes that have less than a minimum threshold CPM for a given group/tissue} +\usage{ +filterLowGenes(obj, groups, threshold = 1, minSamples = NULL, ...) +} +\arguments{ +\item{obj}{ExpressionSet object.} + +\item{groups}{Vector of labels for each sample or a column name of the phenoData slot. +for the ids to filter. Default is the column names.} + +\item{threshold}{The minimum threshold for calling presence of a gene in a sample.} + +\item{minSamples}{Minimum number of samples - defaults to half the minimum group size.} + +\item{...}{Options for \link[edgeR]{cpm}.} +} +\value{ +Filtered ExpressionSet object +} +\description{ +Filter genes that have less than a minimum threshold CPM for a given group/tissue +} +\examples{ +data(skin) +filterLowGenes(skin,'SMTSD') + +} +\seealso{ +\link[edgeR]{cpm} function defined in the edgeR package. +} diff --git a/man/filterMissingGenes.Rd b/man/filterMissingGenes.Rd new file mode 100644 index 00000000..184c7ae7 --- /dev/null +++ b/man/filterMissingGenes.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{filterMissingGenes} +\alias{filterMissingGenes} +\title{Filter genes not expressed in any sample} +\usage{ +filterMissingGenes(obj, threshold = 0) +} +\arguments{ +\item{obj}{ExpressionSet object.} + +\item{threshold}{Minimum sum of gene counts across samples -- defaults to zero.} +} +\value{ +Filtered ExpressionSet object +} +\description{ +The main use case for this function is the removal of missing genes. +} +\examples{ +data(skin) +filterMissingGenes(skin) + +} diff --git a/man/filterSamples.Rd b/man/filterSamples.Rd new file mode 100644 index 00000000..7a6e419a --- /dev/null +++ b/man/filterSamples.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{filterSamples} +\alias{filterSamples} +\title{Filter samples} +\usage{ +filterSamples(obj, ids, groups = colnames(obj), keepOnly = FALSE) +} +\arguments{ +\item{obj}{ExpressionSet object.} + +\item{ids}{Names found within the groups labels corresponding to samples to be removed} + +\item{groups}{Vector of labels for each sample or a column name of the phenoData slot +for the ids to filter. Default is the column names.} + +\item{keepOnly}{Filter or keep only the samples with those labels.} +} +\value{ +Filtered ExpressionSet object +} +\description{ +Filter samples +} +\examples{ +data(skin) +filterSamples(skin,ids = "Skin - Not Sun Exposed (Suprapubic)",groups="SMTSD") +filterSamples(skin,ids=c("GTEX-OHPL-0008-SM-4E3I9","GTEX-145MN-1526-SM-5SI9T")) + +} diff --git a/man/normalizeTissueAware.Rd b/man/normalizeTissueAware.Rd new file mode 100644 index 00000000..2bac2f64 --- /dev/null +++ b/man/normalizeTissueAware.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{normalizeTissueAware} +\alias{normalizeTissueAware} +\title{Normalize in a tissue aware context} +\source{ +The function qsmooth comes from the qsmooth packages +currently available on github under user 'kokrah'. +} +\usage{ +normalizeTissueAware( + obj, + groups, + normalizationMethod = c("qsmooth", "quantile"), + ... +) +} +\arguments{ +\item{obj}{ExpressionSet object} + +\item{groups}{Vector of labels for each sample or a column name of the phenoData slot +for the ids to filter. Default is the column names} + +\item{normalizationMethod}{Choice of 'qsmooth' or 'quantile'} + +\item{...}{Options for \code{\link{qsmooth}} function or \code{\link[limma]{normalizeQuantiles}}} +} +\value{ +ExpressionSet object with an assayData called normalizedMatrix +} +\description{ +This function provides a wrapper to various normalization methods developed. +Currently it only wraps qsmooth and quantile normalization returning a log-transformed +normalized matrix. qsmooth is a normalization approach that normalizes samples in +a condition aware manner. +} +\examples{ +data(skin) +normalizeTissueAware(skin,"SMTSD") + +} diff --git a/man/plotCMDS.Rd b/man/plotCMDS.Rd new file mode 100644 index 00000000..ddb2c0ef --- /dev/null +++ b/man/plotCMDS.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{plotCMDS} +\alias{plotCMDS} +\title{Plot classical MDS of dataset} +\usage{ +plotCMDS( + obj, + comp = 1:2, + normalized = FALSE, + distFun = dist, + distMethod = "euclidian", + n = NULL, + samples = TRUE, + log = TRUE, + plotFlag = TRUE, + ... +) +} +\arguments{ +\item{obj}{ExpressionSet object or objrix.} + +\item{comp}{Which components to display.} + +\item{normalized}{TRUE / FALSE, use the normalized matrix or raw counts.} + +\item{distFun}{Distance function, default is dist.} + +\item{distMethod}{The distance measure to be used. This must be one of "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski". Any unambiguous substring can be given.} + +\item{n}{Number of features to make use of in calculating your distances.} + +\item{samples}{Perform on samples or genes.} + +\item{log}{TRUE/FALSE log2-transform raw counts.} + +\item{plotFlag}{TRUE/FALSE whether to plot or not.} + +\item{...}{Additional plot arguments.} +} +\value{ +coordinates +} +\description{ +This function plots the MDS coordinates for the "n" features of interest. Potentially uncovering batch +effects or feature relationships. +} +\examples{ +data(skin) +res <- plotCMDS(skin,pch=21,bg=factor(pData(skin)$SMTSD)) +\donttest{ +# library(calibrate) +# textxy(X=res[,1],Y=res[,2],labs=rownames(res)) +} +} diff --git a/man/plotDensity.Rd b/man/plotDensity.Rd new file mode 100644 index 00000000..17659732 --- /dev/null +++ b/man/plotDensity.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{plotDensity} +\alias{plotDensity} +\title{Density plots of columns in a matrix} +\usage{ +plotDensity(obj, groups = NULL, normalized = FALSE, legendPos = NULL, ...) +} +\arguments{ +\item{obj}{ExpressionSet object} + +\item{groups}{Vector of labels for each sample or a column name of the phenoData slot +for the ids to filter. Default is the column names.} + +\item{normalized}{TRUE / FALSE, use the normalized matrix or log2-transformed raw counts} + +\item{legendPos}{Legend title position. If null, does not create legend by default.} + +\item{...}{Extra parameters for \link[quantro]{matdensity}.} +} +\value{ +A density plot for each column in the ExpressionSet object colored by groups +} +\description{ +Plots the density of the columns of a matrix. Wrapper for \code{\link[quantro]{matdensity}}. +} +\examples{ +data(skin) +filtData <- filterLowGenes(skin,"SMTSD") +plotDensity(filtData,groups="SMTSD",legendPos="topleft") +# to remove the legend +plotDensity(filtData,groups="SMTSD") + +} diff --git a/man/plotHeatmap.Rd b/man/plotHeatmap.Rd new file mode 100644 index 00000000..c4324e3b --- /dev/null +++ b/man/plotHeatmap.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{plotHeatmap} +\alias{plotHeatmap} +\title{Plot heatmap of most variable genes} +\usage{ +plotHeatmap(obj, n = NULL, fun = stats::sd, normalized = TRUE, log = TRUE, ...) +} +\arguments{ +\item{obj}{ExpressionSet object or objrix.} + +\item{n}{Number of features to make use of in plotting heatmap.} + +\item{fun}{Function to sort genes by, default \code{\link[stats]{sd}}.} + +\item{normalized}{TRUE / FALSE, use the normalized matrix or raw counts.} + +\item{log}{TRUE/FALSE log2-transform raw counts.} + +\item{...}{Additional plot arguments for \code{\link[gplots]{heatmap.2}}.} +} +\value{ +coordinates +} +\description{ +This function plots a heatmap of the gene expressions forthe "n" features of interest. +} +\examples{ +data(skin) +tissues <- pData(skin)$SMTSD +plotHeatmap(skin,normalized=FALSE,log=TRUE,trace="none",n=10) +# Even prettier +\donttest{ +# library(RColorBrewer) +data(skin) +tissues <- pData(skin)$SMTSD +heatmapColColors <- brewer.pal(12,"Set3")[as.integer(factor(tissues))] +heatmapCols <- colorRampPalette(brewer.pal(9, "RdBu"))(50) +plotHeatmap(skin,normalized=FALSE,log=TRUE,trace="none",n=10, + col = heatmapCols,ColSideColors = heatmapColColors,cexRow = 0.6,cexCol = 0.6) +} +} diff --git a/man/prior.pp.Rd b/man/prior.pp.Rd deleted file mode 100644 index 04719523..00000000 --- a/man/prior.pp.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/TIGER.R -\name{prior.pp} -\alias{prior.pp} -\title{Filter low confident edge signs in the prior network using GeneNet} -\usage{ -prior.pp(prior, expr) -} -\arguments{ -\item{prior}{A prior network (adjacency matrix) with rows as TFs and columns as genes.} - -\item{expr}{A normalized log-transformed gene expression matrix.} -} -\value{ -A filtered prior network (adjacency matrix). -} -\description{ -Filter low confident edge signs in the prior network using GeneNet -} diff --git a/man/qsmooth.Rd b/man/qsmooth.Rd new file mode 100644 index 00000000..615555ad --- /dev/null +++ b/man/qsmooth.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{qsmooth} +\alias{qsmooth} +\title{Quantile shrinkage normalization} +\source{ +\href{https://raw.githubusercontent.com/kokrah/qsmooth/master/R/qsmooth.r}{Kwame Okrah's qsmooth R package} +} +\usage{ +qsmooth( + obj, + groups, + norm.factors = NULL, + plot = FALSE, + window = 0.05, + log = TRUE +) +} +\arguments{ +\item{obj}{for counts use log2(raw counts + 1)), for MA use log2(raw intensities)} + +\item{groups}{groups to which samples belong (character vector)} + +\item{norm.factors}{scaling normalization factors} + +\item{plot}{plot weights? (default=FALSE)} + +\item{window}{window size for running median (a fraction of the number of rows of exprs)} + +\item{log}{Whether or not the data should be log transformed before normalization, TRUE = YES.} +} +\value{ +Normalized expression +} +\description{ +This function was modified from github user kokrah. +} +\examples{ +data(skin) +head(netZooR:::qsmooth(skin,groups=pData(skin)$SMTSD)) + +} diff --git a/man/qstats.Rd b/man/qstats.Rd new file mode 100644 index 00000000..401e8beb --- /dev/null +++ b/man/qstats.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/YARN.R +\name{qstats} +\alias{qstats} +\title{Compute quantile statistics} +\source{ +\href{https://raw.githubusercontent.com/kokrah/qsmooth/master/R/qstats.r}{Kwame Okrah's qsmooth R package} +Compute quantile statistics +} +\usage{ +qstats(exprs, groups, window) +} +\arguments{ +\item{exprs}{for counts use log2(raw counts + 1)), for MA use log2(raw intensities)} + +\item{groups}{groups to which samples belong (character vector)} + +\item{window}{window size for running median as a fraction on the number of rows of exprs} +} +\value{ +list of statistics +} +\description{ +This function was directly borrowed from github user kokrah. +} diff --git a/man/TIGER.Rd b/man/tiger.Rd similarity index 100% rename from man/TIGER.Rd rename to man/tiger.Rd diff --git a/vignettes/yarn.Rmd b/vignettes/yarn.Rmd index 048ce603..bb2f0a9d 100644 --- a/vignettes/yarn.Rmd +++ b/vignettes/yarn.Rmd @@ -65,33 +65,33 @@ load("skin.rdata") 2. Check mis-annotation of gender or other phenotypes using group-specific genes ```{r checkMisAnnotation} -yarn::checkMisAnnotation(skin,"GENDER",controlGenes="Y",legendPosition="topleft") +netZooR::checkMisAnnotation(skin,"GENDER",controlGenes="Y",legendPosition="topleft") ``` 3. Decide what sub-groups should be merged ```{r checkTissuesToMerge} -yarn::checkTissuesToMerge(skin,"SMTS","SMTSD") +netZooR::checkTissuesToMerge(skin,"SMTS","SMTSD") ``` 4. Filter lowly expressed genes ```{r filterGenes} -skin_filtered = yarn::filterLowGenes(skin,"SMTSD") +skin_filtered = netZooR::filterLowGenes(skin,"SMTSD") dim(skin) dim(skin_filtered) ``` Or group specific genes ```{r filter} -tmp = yarn::filterGenes(skin,labels=c("X","Y","MT"),featureName = "chromosome_name") +tmp = netZooR::filterGenes(skin,labels=c("X","Y","MT"),featureName = "chromosome_name") # Keep only the sex names -tmp = yarn::filterGenes(skin,labels=c("X","Y","MT"),featureName = "chromosome_name",keepOnly=TRUE) +tmp = netZooR::filterGenes(skin,labels=c("X","Y","MT"),featureName = "chromosome_name",keepOnly=TRUE) ``` 5. Normalize in a tissue or group-aware manner ```{r density} -yarn::plotDensity(skin_filtered,"SMTSD",main=expression('log'[2]*' raw expression')) -skin_filtered = yarn::normalizeTissueAware(skin_filtered,"SMTSD") -yarn::plotDensity(skin_filtered,"SMTSD",normalized=TRUE,main="Normalized") +netZooR::plotDensity(skin_filtered,"SMTSD",main=expression('log'[2]*' raw expression')) +skin_filtered = netZooR::normalizeTissueAware(skin_filtered,"SMTSD") +netZooR::plotDensity(skin_filtered,"SMTSD",normalized=TRUE,main="Normalized") ``` ## Helper functions @@ -102,13 +102,13 @@ We include, `plotCMDS`, `plotDensity`, `plotHeatmap`. `plotCMDS` - PCoA / Classical Multi-Dimensional Scaling of the most variable genes. ```{r} data(skin) -res = yarn::plotCMDS(skin,pch=21,bg=factor(pData(skin)$SMTSD)) +res = netZooR::plotCMDS(skin,pch=21,bg=factor(pData(skin)$SMTSD)) ``` `plotDensity` - Density plots colored by phenotype of choosing. Allows for inspection of global trend differences. ```{r} -filtData = yarn::filterLowGenes(skin,"SMTSD") -yarn::plotDensity(filtData,groups="SMTSD",legendPos="topleft") +filtData = netZooR::filterLowGenes(skin,"SMTSD") +netZooR::plotDensity(filtData,groups="SMTSD",legendPos="topleft") ``` `plotHeatmap` - Heatmap of the most variable genes. @@ -117,7 +117,7 @@ library(RColorBrewer) tissues = pData(skin)$SMTSD heatmapColColors=brewer.pal(12,"Set3")[as.integer(factor(tissues))] heatmapCols = colorRampPalette(brewer.pal(9, "RdBu"))(50) -yarn::plotHeatmap(skin,normalized=FALSE,log=TRUE,trace="none",n=10, +netZooR::plotHeatmap(skin,normalized=FALSE,log=TRUE,trace="none",n=10, col = heatmapCols,ColSideColors = heatmapColColors,cexRow = 0.25,cexCol = 0.25) ```