From 1ef609162089418cfa45f14e6a3173caeed022ee Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 12 Dec 2024 14:49:59 -0800 Subject: [PATCH 1/4] [r] add tf-idf and log normalization functions --- r/NAMESPACE | 2 + r/NEWS.md | 1 + r/R/transforms.R | 54 +++++++++++++++++++++++ r/man/normalize_log.Rd | 21 +++++++++ r/man/normalize_tfidf.Rd | 21 +++++++++ r/pkgdown/_pkgdown.yml | 2 + r/tests/testthat/test-matrix_transforms.R | 46 +++++++++++++++++++ 7 files changed, 147 insertions(+) create mode 100644 r/man/normalize_log.Rd create mode 100644 r/man/normalize_tfidf.Rd diff --git a/r/NAMESPACE b/r/NAMESPACE index 8625c143..622d3a88 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -65,6 +65,8 @@ export(min_by_row) export(min_scalar) export(multiply_cols) export(multiply_rows) +export(normalize_log) +export(normalize_tfidf) export(nucleosome_counts) export(open_fragments_10x) export(open_fragments_dir) diff --git a/r/NEWS.md b/r/NEWS.md index 9715c95a..890f2755 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -22,6 +22,7 @@ Contributions welcome :) - Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices. If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128) - Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128) +- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168) ## Improvements - `trackplot_loop()` now accepts discrete color scales diff --git a/r/R/transforms.R b/r/R/transforms.R index 2a5de994..7d1c46f8 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -923,3 +923,57 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { vars_to_regress = vars_to_regress ) } + +################# +# Normalizations +################# + +#' Normalize a matrix using log normalization +#' @param mat (IterableMatrix) Matrix to normalize +#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization +#' @param add_one (logical) Add one to the matrix before log normalization +#' @returns log normalized matrix. +#' @export +normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE) { + assert_is(mat, "IterableMatrix") + assert_is_numeric(scale_factor) + assert_true(is.logical(add_one)) + assert_greater_than_zero(scale_factor) + mat <- mat * scale_factor + if (!add_one) mat <- mat - 1 + return(log1p(mat)) +} + + +#' Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency +#' @param mat (IterableMatrix) to normalize +#' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then +#' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +#' Else, map each feature name to its mean value. +#' @returns tf-idf normalized matrix. +#' @export +normalize_tfidf <- function(mat, feature_means = NULL, threads = 1L) { + assert_is(mat, "IterableMatrix") + assert_is_wholenumber(threads) + # If feature means are passed in, only need to calculate term frequency + if (is.null(feature_means)) { + mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean")) + feature_means <- mat_stats$row_stats["mean", ] + read_depth <- mat_stats$col_stats["mean", ] * nrow(mat) + } else { + assert_is_numeric(feature_means) + if (!is.null(names(feature_means)) && !is.null(rownames(mat))) { + # Make sure every name in feature means exists in rownames(mat) + # In the case there is a length mismatch but the feature names all exist in feature_means + # will not error out + assert_true(all(rownames(mat) %in% names(feature_means))) + feature_means <- feature_means[rownames(mat)] + } else { + assert_len(feature_means, nrow(mat)) + } + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean",] * nrow(mat) + } + tf <- mat %>% multiply_cols(1 / read_depth) + idf <- 1 / feature_means + return(tf %>% multiply_rows(idf)) +} \ No newline at end of file diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd new file mode 100644 index 00000000..90a57f85 --- /dev/null +++ b/r/man/normalize_log.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_log} +\alias{normalize_log} +\title{Normalize a matrix using log normalization} +\usage{ +normalize_log(mat, scale_factor = 10000, add_one = TRUE) +} +\arguments{ +\item{mat}{(IterableMatrix) Matrix to normalize} + +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization} + +\item{add_one}{(logical) Add one to the matrix before log normalization} +} +\value{ +log normalized matrix. +} +\description{ +Normalize a matrix using log normalization +} diff --git a/r/man/normalize_tfidf.Rd b/r/man/normalize_tfidf.Rd new file mode 100644 index 00000000..bf6a34ef --- /dev/null +++ b/r/man/normalize_tfidf.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transforms.R +\name{normalize_tfidf} +\alias{normalize_tfidf} +\title{Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency} +\usage{ +normalize_tfidf(mat, feature_means = NULL, threads = 1L) +} +\arguments{ +\item{mat}{(IterableMatrix) to normalize} + +\item{feature_means}{(numeric) Means of the features to normalize by. If no names are provided, then +each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. +Else, map each feature name to its mean value.} +} +\value{ +tf-idf normalized matrix. +} +\description{ +Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency +} diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index ea73ec01..22237598 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -126,6 +126,8 @@ reference: - checksum - apply_by_row - regress_out + - normalize_log + - normalize_tfidf - IterableMatrix-methods - pseudobulk_matrix diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 385605e0..b1941f8b 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -346,3 +346,49 @@ test_that("linear regression works", { expect_equal(as(m1, "matrix"), ans) expect_equal(as(m1t, "matrix"), ans) }) + +test_that("tf-idf normalization works", { + m <- generate_sparse_matrix(5, 5) + rownames(m) <- paste0("row", seq_len(nrow(m))) + rev_rownames <- rev(rownames(m)) + # Create tf-idf normalization for dgCMatrix + res_dgc <- diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix") + + rownames(res_dgc) <- rownames(m) + m2 <- as(m, "IterableMatrix") + # Check that we can pass in row means as a (named) vector + row_means <- matrix_stats(m2, row_stats = c("mean"))$row_stats["mean",] + # Test that row means ordering does not matter as long as names exist + row_means_shuffled <- row_means[sample(1:length(row_means))] + # Test that row means can have an extra element as long as all rownames are in the vector + row_means_plus_one <- c(row_means, row6 = 1) + + + res <- normalize_tfidf(m2) + expect_equal(res %>% as("dgCMatrix"), res_dgc) + res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) + expect_identical(res, res_with_row_means) + + res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled) + expect_identical(res_with_row_means, res_with_shuffled_row_means, res) + + res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one) + expect_identical(res, res_with_row_means_with_extra_element) +}) + +test_that("normalize_log works", { + m <- generate_sparse_matrix(5, 5) + m2 <- as(m, "IterableMatrix") + # Test that default params yield the same as log1p on dgCMatrix + res_1 <- as(normalize_log(m2), "dgCMatrix") + expect_equal(res_1, log1p(m*1e4)) + + # Test that changing scale factor works + res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") + expect_identical(res_2, log1p(m*1e5)) + # Test that removing the add_one works + # log of 0 is -inf, but we don't do that on the c side, and just have really large negative numbers. + res_3 <- as(normalize_log(m2, add_one = FALSE), "dgCMatrix") + res_3@x[res_3@x < -700] <- -Inf + expect_identical(as(res_3, "dgeMatrix"), log(m*1e4)) +}) \ No newline at end of file From 98675d0186f2fa7f016543a9f454c3578a5260ef Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Thu, 12 Dec 2024 15:13:11 -0800 Subject: [PATCH 2/4] [r] fix normalization tests --- r/tests/testthat/test-matrix_transforms.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index b1941f8b..06bc3337 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -381,14 +381,14 @@ test_that("normalize_log works", { m2 <- as(m, "IterableMatrix") # Test that default params yield the same as log1p on dgCMatrix res_1 <- as(normalize_log(m2), "dgCMatrix") - expect_equal(res_1, log1p(m*1e4)) + expect_equal(res_1, log1p(m*1e4), tolerance = 1e-6) # Test that changing scale factor works res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") - expect_identical(res_2, log1p(m*1e5)) + expect_equal(res_2, log1p(m*1e5), tolerance = 1e-6) # Test that removing the add_one works # log of 0 is -inf, but we don't do that on the c side, and just have really large negative numbers. res_3 <- as(normalize_log(m2, add_one = FALSE), "dgCMatrix") - res_3@x[res_3@x < -700] <- -Inf - expect_identical(as(res_3, "dgeMatrix"), log(m*1e4)) + res_3@x[res_3@x < -60] <- -Inf + expect_equal(as(res_3, "dgeMatrix"), log(m*1e4), tolerance = 1e-6) }) \ No newline at end of file From 2f83ae647174ea718f0c05505487ab2ebb55393c Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 13 Dec 2024 17:09:56 -0800 Subject: [PATCH 3/4] [r] add in requested changes --- r/R/transforms.R | 38 ++++++++++++++--------- r/man/normalize_log.Rd | 16 +++++----- r/man/normalize_tfidf.Rd | 16 +++++++--- r/pkgdown/_pkgdown.yml | 5 +++ r/tests/testthat/test-matrix_transforms.R | 12 +++---- 5 files changed, 52 insertions(+), 35 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index 7d1c46f8..b2dda267 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -928,31 +928,38 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { # Normalizations ################# -#' Normalize a matrix using log normalization -#' @param mat (IterableMatrix) Matrix to normalize -#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization -#' @param add_one (logical) Add one to the matrix before log normalization -#' @returns log normalized matrix. +#' Normalize a `(features x cells)` matrix using log normalization. +#' @param mat (IterableMatrix) Matrix to normalize. +#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization. +#' @param threads (integer) Number of threads to use.s +#' @returns log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +#' the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} #' @export -normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE) { +normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE, threads = 1L) { assert_is(mat, "IterableMatrix") assert_is_numeric(scale_factor) assert_true(is.logical(add_one)) assert_greater_than_zero(scale_factor) - mat <- mat * scale_factor - if (!add_one) mat <- mat - 1 - return(log1p(mat)) + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) + mat <- mat %>% multiply_cols(1 / read_depth) + return(log1p(mat * scale_factor)) } -#' Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency -#' @param mat (IterableMatrix) to normalize +#' Normalize a `(features x cells)` matrix using term frequency-inverse document frequency. #' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then #' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. #' Else, map each feature name to its mean value. -#' @returns tf-idf normalized matrix. +#' @returns tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +#' the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} +#' @inheritParams normalize_log #' @export -normalize_tfidf <- function(mat, feature_means = NULL, threads = 1L) { +normalize_tfidf <- function( + mat, feature_means = NULL, + scale_factor = 1e4, threads = 1L +) { assert_is(mat, "IterableMatrix") assert_is_wholenumber(threads) # If feature means are passed in, only need to calculate term frequency @@ -971,9 +978,10 @@ normalize_tfidf <- function(mat, feature_means = NULL, threads = 1L) { } else { assert_len(feature_means, nrow(mat)) } - read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean",] * nrow(mat) + read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) } tf <- mat %>% multiply_cols(1 / read_depth) idf <- 1 / feature_means - return(tf %>% multiply_rows(idf)) + tf_idf_mat <- tf %>% multiply_rows(idf) + return(log1p(tf_idf_mat * scale_factor)) } \ No newline at end of file diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd index 90a57f85..9de93071 100644 --- a/r/man/normalize_log.Rd +++ b/r/man/normalize_log.Rd @@ -2,20 +2,22 @@ % Please edit documentation in R/transforms.R \name{normalize_log} \alias{normalize_log} -\title{Normalize a matrix using log normalization} +\title{Normalize a \verb{(features x cells)} matrix using log normalization.} \usage{ -normalize_log(mat, scale_factor = 10000, add_one = TRUE) +normalize_log(mat, scale_factor = 10000, add_one = TRUE, threads = 1L) } \arguments{ -\item{mat}{(IterableMatrix) Matrix to normalize} +\item{mat}{(IterableMatrix) Matrix to normalize.} -\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization} +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} -\item{add_one}{(logical) Add one to the matrix before log normalization} +\item{threads}{(integer) Number of threads to use.s} } \value{ -log normalized matrix. +log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} } \description{ -Normalize a matrix using log normalization +Normalize a \verb{(features x cells)} matrix using log normalization. } diff --git a/r/man/normalize_tfidf.Rd b/r/man/normalize_tfidf.Rd index bf6a34ef..8dc50b84 100644 --- a/r/man/normalize_tfidf.Rd +++ b/r/man/normalize_tfidf.Rd @@ -2,20 +2,26 @@ % Please edit documentation in R/transforms.R \name{normalize_tfidf} \alias{normalize_tfidf} -\title{Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency} +\title{Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency.} \usage{ -normalize_tfidf(mat, feature_means = NULL, threads = 1L) +normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L) } \arguments{ -\item{mat}{(IterableMatrix) to normalize} +\item{mat}{(IterableMatrix) Matrix to normalize.} \item{feature_means}{(numeric) Means of the features to normalize by. If no names are provided, then each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix. Else, map each feature name to its mean value.} + +\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.} + +\item{threads}{(integer) Number of threads to use.s} } \value{ -tf-idf normalized matrix. +tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells, +the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: +\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)} } \description{ -Normalize a `(features x cells)`` matrix using term frequency-inverse document frequency +Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency. } diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml index 22237598..ae2772eb 100644 --- a/r/pkgdown/_pkgdown.yml +++ b/r/pkgdown/_pkgdown.yml @@ -9,6 +9,11 @@ template: includes: in_header: | + + + + + after_body: | diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index 06bc3337..c4c5edb5 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -352,7 +352,7 @@ test_that("tf-idf normalization works", { rownames(m) <- paste0("row", seq_len(nrow(m))) rev_rownames <- rev(rownames(m)) # Create tf-idf normalization for dgCMatrix - res_dgc <- diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix") + res_dgc <- log1p((diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix")) * 1e4) rownames(res_dgc) <- rownames(m) m2 <- as(m, "IterableMatrix") @@ -378,17 +378,13 @@ test_that("tf-idf normalization works", { test_that("normalize_log works", { m <- generate_sparse_matrix(5, 5) + res_dgc <- m %*% diag(1/colSums(m)) %>% as("dgCMatrix") m2 <- as(m, "IterableMatrix") # Test that default params yield the same as log1p on dgCMatrix res_1 <- as(normalize_log(m2), "dgCMatrix") - expect_equal(res_1, log1p(m*1e4), tolerance = 1e-6) + expect_equal(res_1, log1p(res_dgc*1e4), tolerance = 1e-6) # Test that changing scale factor works res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix") - expect_equal(res_2, log1p(m*1e5), tolerance = 1e-6) - # Test that removing the add_one works - # log of 0 is -inf, but we don't do that on the c side, and just have really large negative numbers. - res_3 <- as(normalize_log(m2, add_one = FALSE), "dgCMatrix") - res_3@x[res_3@x < -60] <- -Inf - expect_equal(as(res_3, "dgeMatrix"), log(m*1e4), tolerance = 1e-6) + expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6) }) \ No newline at end of file From 6381f74d02982c7972198aa32a042a99b0808069 Mon Sep 17 00:00:00 2001 From: immanuelazn Date: Fri, 13 Dec 2024 17:15:56 -0800 Subject: [PATCH 4/4] [r] removed unused variable --- r/R/transforms.R | 3 +-- r/man/normalize_log.Rd | 2 +- r/tests/testthat/test-matrix_transforms.R | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/r/R/transforms.R b/r/R/transforms.R index b2dda267..8a2bd25b 100644 --- a/r/R/transforms.R +++ b/r/R/transforms.R @@ -936,10 +936,9 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) { #' the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as: #' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)} #' @export -normalize_log <- function(mat, scale_factor = 1e4, add_one = TRUE, threads = 1L) { +normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) { assert_is(mat, "IterableMatrix") assert_is_numeric(scale_factor) - assert_true(is.logical(add_one)) assert_greater_than_zero(scale_factor) read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat) mat <- mat %>% multiply_cols(1 / read_depth) diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd index 9de93071..97d8c92e 100644 --- a/r/man/normalize_log.Rd +++ b/r/man/normalize_log.Rd @@ -4,7 +4,7 @@ \alias{normalize_log} \title{Normalize a \verb{(features x cells)} matrix using log normalization.} \usage{ -normalize_log(mat, scale_factor = 10000, add_one = TRUE, threads = 1L) +normalize_log(mat, scale_factor = 10000, threads = 1L) } \arguments{ \item{mat}{(IterableMatrix) Matrix to normalize.} diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R index c4c5edb5..67641e54 100644 --- a/r/tests/testthat/test-matrix_transforms.R +++ b/r/tests/testthat/test-matrix_transforms.R @@ -365,7 +365,7 @@ test_that("tf-idf normalization works", { res <- normalize_tfidf(m2) - expect_equal(res %>% as("dgCMatrix"), res_dgc) + expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6) res_with_row_means <- normalize_tfidf(m2, feature_means = row_means) expect_identical(res, res_with_row_means)