diff --git a/.Rbuildignore b/.Rbuildignore index 466bfa05..f2deaa14 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -33,3 +33,4 @@ ^data-raw$ ^CRAN-SUBMISSION$ ^AUTHORS$ +^dev$ diff --git a/DESCRIPTION b/DESCRIPTION index 49e833de..b12f1fb6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,65 +2,74 @@ Package: PLNmodels Title: Poisson Lognormal Models Version: 1.2.1 Authors@R: c( - person("Julien", "Chiquet", role = c("aut", "cre"), email = "julien.chiquet@inrae.fr", - comment = c(ORCID = "0000-0002-3629-3429")), - person("Mahendra", "Mariadassou", role = "aut", email = "mahendra.mariadassou@inrae.fr", - comment = c(ORCID = "0000-0003-2986-354X")), - person("Stéphane", "Robin", role ="aut", email = "stephane.robin@inrae.fr"), - person("François", "Gindraud", role = "aut", email = "francois.gindraud@gmail.com"), - person("Julie", "Aubert", role = "ctb", email = "julie.aubert@inrae.fr"), - person("Bastien", "Batardière", role = "ctb", email = "bastien.batardiere@inrae.fr"), - person("Giovanni", "Poggiato", role = "ctb", email = "giov.poggiato@gmail.com"), - person("Cole", "Trapnell", role = "ctb", email = "coletrap@uw.edu"), - person("Maddy", "Duran", role = "ctb", email = "duran@uw.edu") + person("Julien", "Chiquet", , "julien.chiquet@inrae.fr", role = c("aut", "cre"), + comment = c(ORCID = "0000-0002-3629-3429")), + person("Mahendra", "Mariadassou", , "mahendra.mariadassou@inrae.fr", role = "aut", + comment = c(ORCID = "0000-0003-2986-354X")), + person("Stéphane", "Robin", , "stephane.robin@inrae.fr", role = "aut"), + person("François", "Gindraud", , "francois.gindraud@gmail.com", role = "aut"), + person("Julie", "Aubert", , "julie.aubert@inrae.fr", role = "ctb"), + person("Bastien", "Batardière", , "bastien.batardiere@inrae.fr", role = "ctb"), + person("Giovanni", "Poggiato", , "giov.poggiato@gmail.com", role = "ctb"), + person("Cole", "Trapnell", , "coletrap@uw.edu", role = "ctb"), + person("Maddy", "Duran", , "duran@uw.edu", role = "ctb") ) -Description: The Poisson-lognormal model and variants (Chiquet, Mariadassou and Robin, - 2021 ) can be used for - a variety of multivariate problems when count data are at play, including - principal component analysis for count data, discriminant analysis, model-based clustering and - network inference. Implements variational algorithms to fit such models accompanied with a set of - functions for visualization and diagnostic. +Description: The Poisson-lognormal model and variants (Chiquet, + Mariadassou and Robin, 2021 ) can be + used for a variety of multivariate problems when count data are at + play, including principal component analysis for count data, + discriminant analysis, model-based clustering and network inference. + Implements variational algorithms to fit such models accompanied with + a set of functions for visualization and diagnostic. URL: https://pln-team.github.io/PLNmodels/ BugReports: https://github.com/pln-team/PLNmodels/issues License: GPL (>= 3) -Encoding: UTF-8 -Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 Depends: R (>= 3.6) -LazyData: true -biocViews: Imports: - methods, - stats, - MASS, - future, - future.apply, - R6, - glassoFast, - pscl, - Matrix, - Rcpp, - nloptr, - igraph, - grid, - gridExtra, - dplyr, - tidyr, - purrr, - ggplot2, - corrplot, - magrittr, - torch, - rlang + cli, + corrplot, + dplyr, + future, + future.apply, + ggplot2, + glassoFast, + grDevices, + grid, + gridExtra, + igraph, + magrittr, + MASS, + Matrix, + methods, + nloptr, + parallel, + pscl, + purrr, + R6, + Rcpp, + rlang, + scales, + stats, + tidyr, + torch Suggests: + factoextra, knitr, rmarkdown, - testthat, - pkgdown, spelling, - factoextra -LinkingTo: Rcpp, RcppArmadillo, nloptr -VignetteBuilder: knitr + testthat +LinkingTo: + nloptr, + Rcpp, + RcppArmadillo +VignetteBuilder: + knitr +biocViews: +Encoding: UTF-8 +Language: en-US +LazyData: true +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.3.2 Collate: 'PLNfit-class.R' 'PLN.R' @@ -102,4 +111,3 @@ Collate: 'utils-zipln.R' 'utils.R' 'zzz.R' -Language: en-US diff --git a/NAMESPACE b/NAMESPACE index 8a6ecb49..e02d434d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -107,6 +107,7 @@ importFrom(purrr,map_dbl) importFrom(purrr,map_int) importFrom(purrr,reduce) importFrom(rlang,.data) +importFrom(scales,alpha) importFrom(stats,.getXlevels) importFrom(stats,.lm.fit) importFrom(stats,as.formula) diff --git a/R/barents.R b/R/barents.R index 0a21ee7a..c8995f5a 100644 --- a/R/barents.R +++ b/R/barents.R @@ -7,7 +7,7 @@ #' #' @format A data frame with 6 variables: #' * Abundance: A 30 fish species by 89 sites count matrix -#' * Offset: A 30 fish species by 116 samples offset matrix, measuring the sampling effort in each site +#' * Offset: A 30 fish species by 89 samples offset matrix, measuring the sampling effort in each site #' * 4 covariates for latitude, longitude, depth (in meters), temperature (in Celsius degrees). #' #' @references Fossheim, Maria, Einar M. Nilssen, and Michaela Aschan. "Fish assemblages in the Barents Sea." Marine Biology Research 2.4 (2006). \doi{10.1080/17451000600815698} diff --git a/R/import_utils.R b/R/import_utils.R index 8dff8ad0..41cb4f1b 100644 --- a/R/import_utils.R +++ b/R/import_utils.R @@ -3,19 +3,27 @@ ## %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ## Internal function to find the most comprehensive set of common samples between a count table and a covariates data.frame -common_samples <- function(counts, covariates) { +common_samples <- function(counts, covariates, call = rlang::caller_env()) { ## Have default samples names been created when matching samples? default_names <- FALSE ## Sanity checks: - name_warning <- paste("There are no matching names in the count matrix and the covariates data.frame.", - "Function will proceed assuming:", - "- samples are in the same order;", - "- samples are rows of the abundance matrix.", sep = "\n") + name_warning <- c( + "!" = "There are no matching names in {.var counts} and {.var covariates}.", + "i" = "Function will proceed assuming:", + "i" = "- samples are in the same order;", + "i" = "- samples are rows of {.var counts}.", sep = "\n" + ) + row_count_abort <- c( + "x" = "{.var counts} and {.var covariates} have different number of row(s):", + "i" = "{.var counts} has {.cls {nrow(counts)}} row(s);", + "i" = "{.var covariates} has {.cls {nrow(covariates)}} row(s).", + sep = "\n" + ) ## no sample names in covariates: create sample names if (is.null(rownames(covariates))) { - warning(name_warning) + cli::cli_warn(name_warning, call = call) if (nrow(counts) != nrow(covariates)) { - stop("Incompatible dimensions") + cli::cli_abort(row_count_abort, call = call) } if (is.null(rownames(counts))) rownames(counts) <- paste0("Sample_", 1:nrow(counts)) rownames(covariates) <- rownames(counts) @@ -27,12 +35,15 @@ common_samples <- function(counts, covariates) { # If name matching is impossible, abort if ## - dimension are incompatible or ## - row (samples) names are conflicting - warning(name_warning) + cli::cli_warn(name_warning, call = call) if (nrow(counts) != nrow(covariates)) { - stop("Incompatible dimensions") + cli::cli_abort(row_count_abort, call = call) } if (!is.null(rownames(counts))) { - stop("Conflicting samples names in count matrix and covariates data frames") + cli::cli_abort(c( + "x" = "Conflicting sample names in {.var counts} matrix and {.var covariates} data frames", + "i" = "Sample names in {.var counts} matrix is {.cls {rownames(counts)}} and in {.var covariates} is {.cls {rownames(covariates)}}." + ), call = call) } rownames(counts) <- rownames(covariates) default_names <- TRUE @@ -44,8 +55,20 @@ common_samples <- function(counts, covariates) { if (sample_are_cols) counts <- t(counts) ## Ensure consistency by using only common samples common_samples <- intersect(rownames(counts), rownames(covariates)) + # Add condition when less counts than covariates information if (length(common_samples) < nrow(counts)) { - message(paste0(nrow(counts) - length(common_samples), " samples were dropped from the abundance matrix for lack of associated covariates.")) + cli::cli_warn(c( + "!" = "There are less samples in {.var counts} than in {.var covariates}.", + "i" = "{.cls {nrow(counts) - length(common_samples)}} samples were dropped from the {.var counts} matrix for lack of associated {.var covariates}.", + "i" = "{.cls There {?is/are} {length(common_samples)}} sample{?s} in the final data.frame." + ), call = call) + } + if (length(common_samples) < nrow(covariates)) { + cli::cli_warn(c( + "!" = "There are less samples in {.var covariates} than in {.var counts}.", + "i" = "{.cls {nrow(covariates) - length(common_samples)}} samples were dropped from {.var covariates} for lack of associated {.var counts}.", + "i" = "{.cls There {?is/are} {length(common_samples)}} sample{?s} in the final data.frame." + ), call = call) } return(list(transpose_counts = sample_are_cols, common_samples = common_samples, @@ -403,6 +426,7 @@ species_variance <- function(counts, groups = rep(1, nrow(counts)), depths_as_of #' @param counts Required. An abundance count table, preferably with dimensions names and species as columns. #' @param covariates Required. A covariates data frame, preferably with row names. #' @param offset Optional. Normalization scheme used to compute scaling factors used as offset during PLN inference. Available schemes are "TSS" (Total Sum Scaling, default), "CSS" (Cumulative Sum Scaling, used in metagenomeSeq), "RLE" (Relative Log Expression, used in DESeq2), "GMPR" (Geometric Mean of Pairwise Ratio, introduced in Chen et al., 2018), Wrench (introduced in Kumar et al., 2018) or "none". Alternatively the user can supply its own vector or matrix of offsets (see note for specification of the user-supplied offsets). +#' @param call Optional. The execution environment in which to set the local error call. #' @param ... Additional parameters passed on to [compute_offset()] #' #' @references Chen, L., Reeve, J., Zhang, L., Huang, S., Wang, X. and Chen, J. (2018) GMPR: A robust normalization method for zero-inflated count data with application to microbiome sequencing data. PeerJ, 6, e4600 \doi{10.7717/peerj.4600} @@ -429,7 +453,7 @@ species_variance <- function(counts, groups = rep(1, nrow(counts)), depths_as_of #' ) #' proper_data$Abundance #' proper_data$Offset -prepare_data <- function(counts, covariates, offset = "TSS", ...) { +prepare_data <- function(counts, covariates, offset = "TSS", call = rlang::caller_env(), ...) { ## Convert counts and covariates to expected format counts <- data.matrix(counts, rownames.force = TRUE) covariates <- as.data.frame(covariates) @@ -448,15 +472,19 @@ prepare_data <- function(counts, covariates, offset = "TSS", ...) { counts <- counts[samples, , drop = FALSE] ## Replace NA with 0s if (any(is.na(counts))) { + cli::cli_warn(c( + "!" = "There is at least one NA value in {.var counts}.", + "i" = "{.cls {sum(is.na(counts))}} NA value{?s} in {.var counts} {?has/have} been replaced with 0." + ), call = call) counts[is.na(counts)] <- 0 - warning("NA values in count table replaced with 0.") } ## filter out empty samples empty_samples <- which(rowSums(counts) == 0) - if (length(empty_samples)) { - warning(paste0("Sample(s) ", - paste(samples[empty_samples], collapse = " and "), - " dropped for lack of positive counts.")) + if (length(empty_samples) > 0) { # Add > 0 + cli::cli_warn(c( + "!" = "There is at least one empty sample in {.var counts}.", + "i" = "{.cls {length(empty_samples)}} sample{?s} ({.cls {samples[empty_samples]}}) in {.var counts} {?has/have} been dropped for lack of positive counts." + ), call = call) samples <- samples[-empty_samples] counts <- counts[samples, ,drop = FALSE] } @@ -511,10 +539,12 @@ prepare_data <- function(counts, covariates, offset = "TSS", ...) { compute_offset <- function(counts, offset = c("TSS", "GMPR", "RLE", "CSS", "Wrench", "TMM", "none"), scale = c("none", "count"), ...) { ## special behavior for data.frame if (inherits(offset, "data.frame")) { - stop( - "You supplied a data.frame to compute_offset(). Did you mean to supply a numeric matrix? - Try converting your data.frame to a matrix with as.matrix()." - ) + cli::cli_abort(c( + "{.var offset} must be an available scheme or a vector or matrix of offsets.", + "x" = "You supplied a data.frame for {.var offset}", + "i" = "Did you mean to supply a numeric matrix?", + "i" = "Try converting your data.frame to a matrix with `as.matrix()`." + )) } ## special behavior for numeric offset if (is.numeric(offset)) { diff --git a/R/plot_utils.R b/R/plot_utils.R index 51df0dbf..f1aa1a0e 100644 --- a/R/plot_utils.R +++ b/R/plot_utils.R @@ -86,6 +86,7 @@ circle <- function(center = c(0, 0), radius = 1, npoints = 100) { return(data.frame(x = xx, y = yy)) } +#' @importFrom scales alpha GeomCircle <- ggplot2::ggproto("GeomCircle", ggplot2::Geom, required_aes = c("x", "y", "radius"), diff --git a/R/utils-zipln.R b/R/utils-zipln.R index 1e2418d3..59e9233a 100644 --- a/R/utils-zipln.R +++ b/R/utils-zipln.R @@ -27,7 +27,7 @@ extract_model_zi <- function(call, envir) { call_args <- c(as.list(call_args), list(xlev = attr(call$formula, "xlevels"), na.action = NULL)) ## Extract terms for ZI and PLN components - terms <- .extract_terms_zi(as.formula(call$formula, env = envir)) + terms <- .extract_terms_zi(as.formula(eval(call$formula, env = envir))) ## eval the call in the parent environment with adjustement due to ZI terms call_args$formula <- terms$formula frame <- do.call(stats::model.frame, call_args, envir = envir) @@ -35,7 +35,7 @@ extract_model_zi <- function(call, envir) { ## Save level for predict function xlevels <- list(PLN = .getXlevels(terms$PLN, frame)) if (!is.null(terms$ZI)) xlevels$ZI = .getXlevels(terms$ZI, frame) - attr(call$formula, "xlevels") <- xlevels + if (!is.null(xlevels$PLN)) attr(call$formula, "xlevels") <- xlevels ## Create the set of matrices to fit the PLN model X <- model.matrix(terms$PLN, frame, xlev = xlevels$PLN) diff --git a/inst/case_studies/oaks_tree.R b/inst/case_studies/oaks_tree.R index c1814b83..e93ac50f 100644 --- a/inst/case_studies/oaks_tree.R +++ b/inst/case_studies/oaks_tree.R @@ -97,14 +97,16 @@ factoextra::fviz_pca_biplot( ## Network inference with sparce covariance estimation -system.time(myZIPLNnets <- ZIPLNnetwork(Abundance ~ 0 + tree + offset(log(Offset)), zi = "single", data = oaks, control = ZIPLNnetwork_param(min_ratio = 0.1))) - system.time(myPLNnets <- PLNnetwork(Abundance ~ 0 + tree + offset(log(Offset)), data = oaks, control = PLNnetwork_param(min_ratio = 0.1))) plot(myPLNnets) plot(getBestModel(myPLNnets, "EBIC")) # stability_selection(myPLNnets) # plot(getBestModel(myPLNnets, "StARS", stability = .975)) +system.time(myZIPLNnets <- ZIPLNnetwork(Abundance ~ 0 + tree + offset(log(Offset)), zi = "single", data = oaks, control = ZIPLNnetwork_param(min_ratio = 0.1))) +plot(myZIPLNnets) +plot(getBestModel(myZIPLNnets, "EBIC")) + ## Mixture model to recover tree structure system.time(my_mixtures <- PLNmixture(Abundance ~ 1 + offset(log(Offset)), data = oaks, clusters = 1:5)) diff --git a/man/barents.Rd b/man/barents.Rd index 5ada292e..83c785d2 100644 --- a/man/barents.Rd +++ b/man/barents.Rd @@ -8,7 +8,7 @@ A data frame with 6 variables: \itemize{ \item Abundance: A 30 fish species by 89 sites count matrix -\item Offset: A 30 fish species by 116 samples offset matrix, measuring the sampling effort in each site +\item Offset: A 30 fish species by 89 samples offset matrix, measuring the sampling effort in each site \item 4 covariates for latitude, longitude, depth (in meters), temperature (in Celsius degrees). } } diff --git a/man/prepare_data.Rd b/man/prepare_data.Rd index db4cd1f4..e9b8b773 100644 --- a/man/prepare_data.Rd +++ b/man/prepare_data.Rd @@ -4,7 +4,13 @@ \alias{prepare_data} \title{Prepare data for use in PLN models} \usage{ -prepare_data(counts, covariates, offset = "TSS", ...) +prepare_data( + counts, + covariates, + offset = "TSS", + call = rlang::caller_env(), + ... +) } \arguments{ \item{counts}{Required. An abundance count table, preferably with dimensions names and species as columns.} @@ -13,6 +19,8 @@ prepare_data(counts, covariates, offset = "TSS", ...) \item{offset}{Optional. Normalization scheme used to compute scaling factors used as offset during PLN inference. Available schemes are "TSS" (Total Sum Scaling, default), "CSS" (Cumulative Sum Scaling, used in metagenomeSeq), "RLE" (Relative Log Expression, used in DESeq2), "GMPR" (Geometric Mean of Pairwise Ratio, introduced in Chen et al., 2018), Wrench (introduced in Kumar et al., 2018) or "none". Alternatively the user can supply its own vector or matrix of offsets (see note for specification of the user-supplied offsets).} +\item{call}{Optional. The execution environment in which to set the local error call.} + \item{...}{Additional parameters passed on to \code{\link[=compute_offset]{compute_offset()}}} } \value{ diff --git a/tests/testthat/test-import-utils.R b/tests/testthat/test-import-utils.R index c4f4bf49..22e93a8e 100644 --- a/tests/testthat/test-import-utils.R +++ b/tests/testthat/test-import-utils.R @@ -48,13 +48,24 @@ test_that("common_samples succeeds on matrices with no or partial dimension name default_names = TRUE)) }) -test_that("common_samples fails on matrices with dimension names but no common samples", { - expect_error( +# test_that("common_samples fails on matrices with dimension names but no common samples", { +# expect_error( +# suppressWarnings( +# common_samples( +# `rownames<-`(counts, paste0("Sample_", 1:49)), +# covariates))#, +# #"Conflicting samples names in count matrix and covariates data frames" +# ) +# }) + +cli::test_that_cli("common_samples fails on matrices with dimension names but conflicting names", { + testthat::local_edition(3) + expect_snapshot( + error = TRUE, suppressWarnings( common_samples( `rownames<-`(counts, paste0("Sample_", 1:49)), - covariates)), - "Conflicting samples names in count matrix and covariates data frames" + covariates)) ) }) @@ -65,9 +76,10 @@ test_that("common_samples succeeds on matrices with dimension names and differen default_names = FALSE)) }) -test_that("common_samples find biggest subset of common samples and produces message.", { - expect_message(result <- common_samples(counts, covariates[1:35, ]), - "14 samples were dropped from the abundance matrix for lack of associated covariates.") +cli::test_that_cli("common_samples find biggest subset of common samples and produces message.", { + testthat::local_edition(3) + expect_snapshot(result <- common_samples(counts, covariates[1:35, ])) + expect_warning(result <- common_samples(counts, covariates[1:35, ])) expect_length(result$common_samples, 35) expect_equal(result$common_samples, as.character(1:35)) @@ -156,14 +168,15 @@ test_that("compute_offset provides correct answers for identical samples", { expect_null(compute_offset(counts, "none")) }) -test_that("compute_offset fails with an informative error when given a data.frame", { +cli::test_that_cli("compute_offset fails with an informative error when given a data.frame", { sizes <- rep(1, 5) counts <- sizes %o% 1:10 - - expect_error(compute_offset(counts, data.frame(counts)), - regexp = "You supplied a data.frame to compute_offset(). Did you mean to supply a numeric matrix? - Try converting your data.frame to a matrix with as.matrix().", - fixed = TRUE) + testthat::local_edition(3) + testthat::expect_snapshot(error = TRUE, + compute_offset(counts, data.frame(counts))) +# regexp = "You supplied a data.frame to compute_offset(). Did you mean to supply a numeric matrix? +# Try converting your data.frame to a matrix with as.matrix().", +# fixed = TRUE) }) @@ -369,21 +382,22 @@ test_that("species_variance works as intented", { test_that("prepare_data drops samples with no positive counts", { counts[1:2, ] <- 0 - expect_warning(result <- prepare_data(counts, covariates, offset = "none"), - "Sample(s) 1 and 2 dropped for lack of positive counts.", - fixed = TRUE + expect_warning(result <- prepare_data(counts, covariates, offset = "none")#, + # "Sample(s) 1 and 2 dropped for lack of positive counts.", + # fixed = TRUE ) expect_equal(dim(result), c(nrow(counts)-2, ncol(covariates)+1)) expect_equal(dim(result$Abundance), c(nrow(counts)-2, ncol(counts))) }) -test_that("prepare_data replace NA with 0s", { - counts[1:2, 1] <- NA - expect_warning(result <- prepare_data(counts, covariates, offset = "none"), - "NA values in count table replaced with 0.") - expect_equal(dim(result), c(nrow(counts), ncol(covariates)+1)) - expect_equal(dim(result$Abundance), c(nrow(counts), ncol(counts))) -}) + test_that("prepare_data replace NA with 0s", { + counts[1:2, 1] <- NA + expect_warning(result <- prepare_data(counts, covariates, offset = "none")) + #, +# "NA values in count table replaced with 0.") + expect_equal(dim(result), c(nrow(counts), ncol(covariates)+1)) + expect_equal(dim(result$Abundance), c(nrow(counts), ncol(counts))) + }) result <- data.frame( Abundance = NA,