Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[r] add tf-idf and log normalization functions #168

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ export(min_by_row)
export(min_scalar)
export(multiply_cols)
export(multiply_rows)
export(normalize_log)
export(normalize_tfidf)
export(nucleosome_counts)
export(open_fragments_10x)
export(open_fragments_dir)
Expand Down
1 change: 1 addition & 0 deletions r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Contributions welcome :)
- Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices.
If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128)
- Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128)
- Add functions `normalize_tfidf()` and `normalize_log()`, which allow for easy normalization of iterable matrices using TF-IDF or log1p(pull request #168)

## Improvements
- `trackplot_loop()` now accepts discrete color scales
Expand Down
61 changes: 61 additions & 0 deletions r/R/transforms.R
Original file line number Diff line number Diff line change
Expand Up @@ -923,3 +923,64 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) {
vars_to_regress = vars_to_regress
)
}

#################
# Normalizations
#################

#' Normalize a `(features x cells)` matrix using log normalization.
#' @param mat (IterableMatrix) Matrix to normalize.
#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization.
#' @param threads (integer) Number of threads to use.s
#' @returns log normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
#' the log normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as:
#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)}
#' @export
normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) {
assert_is(mat, "IterableMatrix")
assert_is_numeric(scale_factor)
assert_greater_than_zero(scale_factor)
read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat)
mat <- mat %>% multiply_cols(1 / read_depth)
return(log1p(mat * scale_factor))
}


#' Normalize a `(features x cells)` matrix using term frequency-inverse document frequency.
#' @param feature_means (numeric) Means of the features to normalize by. If no names are provided, then
#' each numeric value is assumed to correspond to the feature mean for the corresponding row of the matrix.
#' Else, map each feature name to its mean value.
#' @returns tf-idf normalized matrix. For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
#' the tf-idf normalization of that element, \eqn{\tilde{x}_{ij}} is calculated as:
#' \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)}
#' @inheritParams normalize_log
#' @export
normalize_tfidf <- function(
mat, feature_means = NULL,
scale_factor = 1e4, threads = 1L
) {
assert_is(mat, "IterableMatrix")
assert_is_wholenumber(threads)
# If feature means are passed in, only need to calculate term frequency
if (is.null(feature_means)) {
mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"))
feature_means <- mat_stats$row_stats["mean", ]
read_depth <- mat_stats$col_stats["mean", ] * nrow(mat)
} else {
assert_is_numeric(feature_means)
if (!is.null(names(feature_means)) && !is.null(rownames(mat))) {
# Make sure every name in feature means exists in rownames(mat)
# In the case there is a length mismatch but the feature names all exist in feature_means
# will not error out
assert_true(all(rownames(mat) %in% names(feature_means)))
feature_means <- feature_means[rownames(mat)]
} else {
assert_len(feature_means, nrow(mat))
}
read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat)
}
tf <- mat %>% multiply_cols(1 / read_depth)
idf <- 1 / feature_means
tf_idf_mat <- tf %>% multiply_rows(idf)
return(log1p(tf_idf_mat * scale_factor))
}
23 changes: 23 additions & 0 deletions r/man/normalize_log.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions r/man/normalize_tfidf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions r/pkgdown/_pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ template:
includes:
in_header: |
<script defer data-domain="benparks.net" src="https://plausible.benparks.net/js/visit-counts.js"></script>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.15/dist/katex.min.css" integrity="sha384-Htz9HMhiwV8GuQ28Xr9pEs1B4qJiYu/nYLLwlDklR53QibDfmQzi7rYxXhMH/5/u" crossorigin="anonymous">
<!-- The loading of KaTeX is deferred to speed up page rendering -->
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.15/dist/katex.min.js" integrity="sha384-bxmi2jLGCvnsEqMuYLKE/KsVCxV3PqmKeK6Y6+lmNXBry6+luFkEOsmp5vD9I/7+" crossorigin="anonymous"></script>
<!-- To automatically render math in text elements, include the auto-render extension: -->
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.15/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous" onload="renderMathInElement(document.body);"></script>
after_body: |
<img src="https://plausible.benparks.net/flask-plausible/bpcells-docs.png" style="position:absolute;" />

Expand Down Expand Up @@ -126,6 +131,8 @@ reference:
- checksum
- apply_by_row
- regress_out
- normalize_log
- normalize_tfidf
- IterableMatrix-methods
- pseudobulk_matrix

Expand Down
42 changes: 42 additions & 0 deletions r/tests/testthat/test-matrix_transforms.R
Original file line number Diff line number Diff line change
Expand Up @@ -346,3 +346,45 @@ test_that("linear regression works", {
expect_equal(as(m1, "matrix"), ans)
expect_equal(as(m1t, "matrix"), ans)
})

test_that("tf-idf normalization works", {
m <- generate_sparse_matrix(5, 5)
rownames(m) <- paste0("row", seq_len(nrow(m)))
rev_rownames <- rev(rownames(m))
# Create tf-idf normalization for dgCMatrix
res_dgc <- log1p((diag(1/rowMeans(m)) %*% (m %*% diag(1/colSums(m))) %>% as("dgCMatrix")) * 1e4)

rownames(res_dgc) <- rownames(m)
m2 <- as(m, "IterableMatrix")
# Check that we can pass in row means as a (named) vector
row_means <- matrix_stats(m2, row_stats = c("mean"))$row_stats["mean",]
# Test that row means ordering does not matter as long as names exist
row_means_shuffled <- row_means[sample(1:length(row_means))]
# Test that row means can have an extra element as long as all rownames are in the vector
row_means_plus_one <- c(row_means, row6 = 1)


res <- normalize_tfidf(m2)
expect_equal(res %>% as("dgCMatrix"), res_dgc, tolerance = 1e-6)
res_with_row_means <- normalize_tfidf(m2, feature_means = row_means)
expect_identical(res, res_with_row_means)

res_with_shuffled_row_means <- normalize_tfidf(m2, feature_means = row_means_shuffled)
expect_identical(res_with_row_means, res_with_shuffled_row_means, res)

res_with_row_means_with_extra_element <- normalize_tfidf(m2, feature_means = row_means_plus_one)
expect_identical(res, res_with_row_means_with_extra_element)
})

test_that("normalize_log works", {
m <- generate_sparse_matrix(5, 5)
res_dgc <- m %*% diag(1/colSums(m)) %>% as("dgCMatrix")
m2 <- as(m, "IterableMatrix")
# Test that default params yield the same as log1p on dgCMatrix
res_1 <- as(normalize_log(m2), "dgCMatrix")
expect_equal(res_1, log1p(res_dgc*1e4), tolerance = 1e-6)

# Test that changing scale factor works
res_2 <- as(normalize_log(m2, scale_factor = 1e5), "dgCMatrix")
expect_equal(res_2, log1p(res_dgc*1e5), tolerance = 1e-6)
})
Loading