diff --git a/r/NAMESPACE b/r/NAMESPACE
index 8625c143..622d3a88 100644
--- a/r/NAMESPACE
+++ b/r/NAMESPACE
@@ -65,6 +65,8 @@ export(min_by_row)
export(min_scalar)
export(multiply_cols)
export(multiply_rows)
+export(normalize_log)
+export(normalize_tfidf)
export(nucleosome_counts)
export(open_fragments_10x)
export(open_fragments_dir)
diff --git a/r/NEWS.md b/r/NEWS.md
index 9715c95a..890f2755 100644
--- a/r/NEWS.md
+++ b/r/NEWS.md
@@ -22,6 +22,7 @@ Contributions welcome :)
- Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices.
If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128)
- Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128)
+- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168)
## Improvements
- `trackplot_loop()` now accepts discrete color scales
diff --git a/r/R/transforms.R b/r/R/transforms.R
index 2a5de994..8a2bd25b 100644
--- a/r/R/transforms.R
+++ b/r/R/transforms.R
@@ -923,3 +923,64 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) {
vars_to_regress = vars_to_regress
)
}
+
+#################
+# Normalizations
+#################
+
+#' Normalize a `(features x cells)` matrix using log normalization.
+#' @param mat (IterableMatrix) Matrix to normalize.
+#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization.
+#' @param threads (integer) Number of threads to use.s
+#' @returns log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
+#' the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as:
+#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)}
+#' @export
+normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) {
+ assert_is(mat, "IterableMatrix")
+ assert_is_numeric(scale_factor)
+ assert_greater_than_zero(scale_factor)
+ read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat)
+ mat <- mat %>% multiply_cols(1 / read_depth)
+ return(log1p(mat * scale_factor))
+}
+
+
+#' Normalize a `(features x cells)` matrix using term frequency-inverse document frequency.
+#' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then
+#' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix.
+#' Else, map each feature name to its mean value.
+#' @returns tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
+#' the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as:
+#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)}
+#' @inheritParams normalize_log
+#' @export
+normalize_tfidf <- function(
+ mat, feature_means = NULL,
+ scale_factor = 1e4, threads = 1L
+) {
+ assert_is(mat, "IterableMatrix")
+ assert_is_wholenumber(threads)
+ # If feature means are passed in, only need to calculate term frequency
+ if (is.null(feature_means)) {
+ mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"))
+ feature_means <- mat_stats$row_stats["mean", ]
+ read_depth <- mat_stats$col_stats["mean", ] * nrow(mat)
+ } else {
+ assert_is_numeric(feature_means)
+ if (!is.null(names(feature_means)) && !is.null(rownames(mat))) {
+ # Make sure every name in feature means exists in rownames(mat)
+ # In the case there is a length mismatch but the feature names all exist in feature_means
+ # will not error out
+ assert_true(all(rownames(mat) %in% names(feature_means)))
+ feature_means <- feature_means[rownames(mat)]
+ } else {
+ assert_len(feature_means, nrow(mat))
+ }
+ read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat)
+ }
+ tf <- mat %>% multiply_cols(1 / read_depth)
+ idf <- 1 / feature_means
+ tf_idf_mat <- tf %>% multiply_rows(idf)
+ return(log1p(tf_idf_mat * scale_factor))
+}
\ No newline at end of file
diff --git a/r/man/normalize_log.Rd b/r/man/normalize_log.Rd
new file mode 100644
index 00000000..97d8c92e
--- /dev/null
+++ b/r/man/normalize_log.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transforms.R
+\name{normalize_log}
+\alias{normalize_log}
+\title{Normalize a \verb{(features x cells)} matrix using log normalization.}
+\usage{
+normalize_log(mat, scale_factor = 10000, threads = 1L)
+}
+\arguments{
+\item{mat}{(IterableMatrix) Matrix to normalize.}
+
+\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.}
+
+\item{threads}{(integer) Number of threads to use.s}
+}
+\value{
+log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
+the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as:
+\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)}
+}
+\description{
+Normalize a \verb{(features x cells)} matrix using log normalization.
+}
diff --git a/r/man/normalize_tfidf.Rd b/r/man/normalize_tfidf.Rd
new file mode 100644
index 00000000..8dc50b84
--- /dev/null
+++ b/r/man/normalize_tfidf.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/transforms.R
+\name{normalize_tfidf}
+\alias{normalize_tfidf}
+\title{Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency.}
+\usage{
+normalize_tfidf(mat, feature_means = NULL, scale_factor = 10000, threads = 1L)
+}
+\arguments{
+\item{mat}{(IterableMatrix) Matrix to normalize.}
+
+\item{feature_means}{(numeric) Means of the features to normalize by. If no names are provided, then
+each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix.
+Else, map each feature name to its mean value.}
+
+\item{scale_factor}{(numeric) Scale factor to multiply matrix by for log normalization.}
+
+\item{threads}{(integer) Number of threads to use.s}
+}
+\value{
+tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
+the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as:
+\eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)}
+}
+\description{
+Normalize a \verb{(features x cells)} matrix using term frequency-inverse document frequency.
+}
diff --git a/r/pkgdown/_pkgdown.yml b/r/pkgdown/_pkgdown.yml
index ea73ec01..ae2772eb 100644
--- a/r/pkgdown/_pkgdown.yml
+++ b/r/pkgdown/_pkgdown.yml
@@ -9,6 +9,11 @@ template:
includes:
in_header: |
+
+
+
+
+
after_body: |
@@ -126,6 +131,8 @@ reference:
- checksum
- apply_by_row
- regress_out
+ - normalize_log
+ - normalize_tfidf
- IterableMatrix-methods
- pseudobulk_matrix
diff --git a/r/tests/testthat/test-matrix_transforms.R b/r/tests/testthat/test-matrix_transforms.R
index 385605e0..67641e54 100644
--- a/r/tests/testthat/test-matrix_transforms.R
+++ b/r/tests/testthat/test-matrix_transforms.R
@@ -346,3 +346,45 @@ test_that("linear regression works", {
expect_equal(as(m1, "matrix"), ans)
expect_equal(as(m1t, "matrix"), ans)
})
+
+test_that("tf-idf normalization works", {
+ m <- generate_sparse_matrix(5, 5)
+ rownames(m) <- paste0("row", seq_len(nrow(m)))
+ rev_rownames <- rev(rownames(m))
+ # Create tf-idf normalization for dgCMatrix
+ res_dgc <- log1p((diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix")) * 1e4)
+
+ rownames(res_dgc) <- rownames(m)
+ m2 <- as(m, "IterableMatrix")
+ # Check that we can pass in row means as a (named) vector
+ row_means <- matrix_stats(m2, row_stats = c("mean"))$row_stats["mean",]
+ # Test that row means ordering does not matter as long as names exist
+ row_means_shuffled <- row_means[sample(1:length(row_means))]
+ # Test that row means can have an extra element as long as all rownames are in the vector
+ row_means_plus_one <- c(row_means, row6 = 1)
+
+
+ res <- normalize_tfidf(m2)
+ expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6)
+ res_with_row_means <- normalize_tfidf(m2, feature_means = row_means)
+ expect_identical(res, res_with_row_means)
+
+ res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled)
+ expect_identical(res_with_row_means, res_with_shuffled_row_means, res)
+
+ res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one)
+ expect_identical(res, res_with_row_means_with_extra_element)
+})
+
+test_that("normalize_log works", {
+ m <- generate_sparse_matrix(5, 5)
+ res_dgc <- m %*% diag(1/colSums(m)) %>% as("dgCMatrix")
+ m2 <- as(m, "IterableMatrix")
+ # Test that default params yield the same as log1p on dgCMatrix
+ res_1 <- as(normalize_log(m2), "dgCMatrix")
+ expect_equal(res_1, log1p(res_dgc*1e4), tolerance = 1e-6)
+
+ # Test that changing scale factor works
+ res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix")
+ expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6)
+})
\ No newline at end of file