[r] Add feature selection methods by variance, dispersion, and mean accessibility #169

Open · wants to merge 16 commits into base: main
4 changes: 4 additions & 0 deletions r/NAMESPACE
@@ -65,6 +65,8 @@ export(min_by_row)
export(min_scalar)
export(multiply_cols)
export(multiply_rows)
export(normalize_log)
export(normalize_tfidf)
export(nucleosome_counts)
export(open_fragments_10x)
export(open_fragments_dir)
@@ -108,6 +110,8 @@ export(rowVars.default)
export(sctransform_pearson)
export(select_cells)
export(select_chromosomes)
export(select_features_mean)
export(select_features_variance)
export(select_regions)
export(set_trackplot_height)
export(set_trackplot_label)
3 changes: 3 additions & 0 deletions r/NEWS.md
@@ -10,6 +10,7 @@ Contributions welcome :)

## Features
- Add `write_matrix_anndata_hdf5_dense()` which allows writing matrices in AnnData's dense format, most commonly used for `obsm` or `varm` matrices. (Thanks to @ycli1995 for pull request #166)
- Add normalization helper functions `normalize_log()` and `normalize_tfidf()` (pull request #168)

## Bug-fixes
- Fix error message printing when MACS crashes during `call_peaks_macs()` (pull request #175)
@@ -39,6 +40,8 @@ of the new features this release and will continue to help with maintenance and
- Add `rowQuantiles()` and `colQuantiles()` functions, which return the quantiles of each row/column of a matrix. Currently `rowQuantiles()` only works on row-major matrices and `colQuantiles()` only works on col-major matrices.
If `matrixStats` or `MatrixGenerics` packages are installed, `BPCells::colQuantiles()` will fall back to their implementations for non-BPCells objects. (pull request #128)
- Add `pseudobulk_matrix()` which allows pseudobulk aggregation by `sum` or `mean` and calculation of per-pseudobulk `variance` and `nonzero` statistics for each gene (pull request #128)
- Add functions `normalize_tfidf()` and `normalize_log()`, which allow easy normalization of iterable matrices using TF-IDF or log1p (pull request #168)
- Add feature selection functions `select_features_{variance,dispersion,mean}()`, with parameters controlling normalization and the number of variable features (pull request #169)

## Improvements
- `trackplot_loop()` now accepts discrete color scales
140 changes: 140 additions & 0 deletions r/R/singlecell_utils.R
@@ -6,6 +6,146 @@
# option. This file may not be copied, modified, or distributed
# except according to those terms.


#################
# Feature selection
#################

#' Feature selection functions
#'
#' Apply a feature selection method to a `(features x cells)` matrix.
#' @rdname feature_selection
#' @param mat (IterableMatrix) dimensions features x cells
#' @param num_feats (numeric) Number of features to return. A value strictly between 0 and 1 is
#'    treated as a proportion of the number of features, rounded down; a value of 1 or more is
#'    treated as an absolute count. Values larger than the number of features return all features.
#' @param normalize (function) Optional normalization function applied to the matrix before scoring
#'    (e.g. `normalize_log`). If `NULL`, no normalization is performed.
#' @param threads (integer) Number of threads to use.
#' @returns
#' A data frame with the following columns, sorted in descending order of score:
#' - `names`: Feature name.
#' - `score`: Score of the feature under the chosen method.
#' - `highly_variable`: Logical indicating whether the feature ranks within the top `num_feats`.
#'
#' Each feature selection method uses a different score:
#' - `select_features_variance`: Variance of each feature.
#' @details
#' `select_features_variance` scores features using the following process:
#' 1. Optionally normalize the matrix with the `normalize` function (e.g. TF-IDF or log1p).
#' 2. Rank features by variance and flag the top `num_feats`.
#' @export
select_features_variance <- function(
mat, num_feats = 0.05,
normalize = NULL,
threads = 1L
) {
assert_is(num_feats, "numeric")
assert_len(num_feats, 1)
assert_greater_than_zero(num_feats)
if (rlang::is_missing(mat)) {
return(create_partial(
missing_args = list(
num_feats = missing(num_feats),
normalize = missing(normalize),
threads = missing(threads)
)
))
}
assert_is(mat, "IterableMatrix")
num_feats <- min(max(num_feats, 0), nrow(mat))
if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats)
if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat)
features_df <- tibble::tibble(
names = rownames(mat),
score = matrix_stats(mat, row_stats = "variance", threads = threads)$row_stats["variance",]
) %>%
dplyr::arrange(desc(score)) %>%
dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats)
return(features_df)
}
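The ranking logic above can be sketched in base R on a small dense toy matrix (hypothetical data and plain `apply`/`order` calls; the real function streams over a BPCells `IterableMatrix`):

```r
# Base-R sketch of variance-based feature selection on a dense
# (features x cells) toy matrix.
set.seed(1)
mat <- matrix(rpois(5 * 10, lambda = 3), nrow = 5,
              dimnames = list(paste0("feat", 1:5), NULL))
num_feats <- 2
score <- apply(mat, 1, var)                      # per-feature variance
ord <- order(score, decreasing = TRUE)
features_df <- data.frame(
  names = rownames(mat)[ord],
  score = score[ord],
  highly_variable = seq_along(ord) <= num_feats  # flag the top num_feats
)
```

The same shape of output (names, score, highly_variable, sorted descending by score) is what the package functions return as a tibble.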


#' @rdname feature_selection
#' @returns
#' - `select_features_dispersion`: Dispersion (variance/mean) of each feature.
#' @details
#' `select_features_dispersion` scores features using the following process:
#' 1. Optionally normalize the matrix with the `normalize` function (e.g. TF-IDF or log1p).
#' 2. Compute the dispersion (variance/mean) of each feature.
#' 3. Rank features by dispersion and flag the top `num_feats`.
#' @export
select_features_dispersion <- function(
mat, num_feats = 0.05,
normalize = NULL,
threads = 1L
) {
assert_is(num_feats, "numeric")
assert_len(num_feats, 1)
assert_greater_than_zero(num_feats)
if (rlang::is_missing(mat)) {
return(create_partial(
missing_args = list(
num_feats = missing(num_feats),
normalize = missing(normalize),
threads = missing(threads)
)
))
}
assert_is(mat, "IterableMatrix")
num_feats <- min(max(num_feats, 0), nrow(mat))
if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats)
if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat)
mat_stats <- matrix_stats(mat, row_stats = "variance", threads = threads)
features_df <- tibble::tibble(
names = rownames(mat),
score = mat_stats$row_stats["variance", ] / mat_stats$row_stats["mean", ]
) %>%
dplyr::arrange(desc(score)) %>%
dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats)
return(features_df)
}


#' @rdname feature_selection
#' @returns
#' - `select_features_mean`: Number of cells in which each feature is nonzero (mean accessibility after binarization).
#' @details
#' `select_features_mean` scores features using the following process:
#' 1. Optionally normalize the matrix with the `normalize` function.
#' 2. Count the nonzero entries (binarized accessibility) of each feature.
#' 3. Rank features by this count and flag the top `num_feats`.
#' @export
select_features_mean <- function(mat, num_feats = 0.05, normalize = NULL, threads = 1L) {
assert_is(num_feats, "numeric")
assert_len(num_feats, 1)
assert_greater_than_zero(num_feats)
if (rlang::is_missing(mat)) {
return(create_partial(
missing_args = list(
num_feats = missing(num_feats),
normalize = missing(normalize),
threads = missing(threads)
)
))
}
assert_is(mat, "IterableMatrix")
num_feats <- min(max(num_feats, 0), nrow(mat))
if (num_feats < 1 && num_feats > 0) num_feats <- floor(nrow(mat) * num_feats)
if (!is.null(normalize)) mat <- partial_apply(normalize, threads = threads)(mat)
# Score each feature by its number of nonzero (binarized) entries
features_df <- tibble::tibble(
names = rownames(mat),
score = matrix_stats(mat, row_stats = "nonzero", threads = threads)$row_stats["nonzero", ]
) %>%
dplyr::arrange(desc(score)) %>%
dplyr::mutate(highly_variable = dplyr::row_number() <= num_feats)
return(features_df)
}
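The binarize-and-count scoring can be sketched in base R on a hypothetical dense toy matrix (toy peak names and counts; the real function uses the streaming `nonzero` row statistic of an `IterableMatrix`):

```r
# Base-R sketch of mean-accessibility scoring: binarize the counts,
# then count the cells in which each feature is nonzero.
counts <- matrix(c(0, 2, 1,
                   5, 0, 0), nrow = 2, byrow = TRUE,
                 dimnames = list(c("peakA", "peakB"), NULL))
score <- rowSums(counts > 0)  # peakA is open in 2 cells, peakB in 1
```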


#' Test for marker features
#'
#' Given a features x cells matrix, perform one-vs-all differential
86 changes: 86 additions & 0 deletions r/R/transforms.R
@@ -923,3 +923,89 @@ regress_out <- function(mat, latent_data, prediction_axis = c("row", "col")) {
vars_to_regress = vars_to_regress
)
}

#################
# Normalizations
#################

#' Normalization helper functions
#'
#' Apply standard normalizations to a `(features x cells)` counts matrix.
#'
#' @rdname normalize
#' @param mat (IterableMatrix) Counts matrix to normalize. `(features x cells)`
#' @param scale_factor (numeric) Scale factor to multiply matrix by for log normalization.
#' @param threads (integer) Number of threads to use.
#' @returns For each element \eqn{x_{ij}} in matrix \eqn{X} with \eqn{i} features and \eqn{j} cells,
#' transform to a normalized value \eqn{\tilde{x}_{ij}} calculated as:
#'
#' - `normalize_log`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{colSum}_j} + 1)}
#' @details - `normalize_log`: Corresponds to `Seurat::LogNormalize()`
#' @export
normalize_log <- function(mat, scale_factor = 1e4, threads = 1L) {
assert_is_numeric(scale_factor)
assert_greater_than_zero(scale_factor)
if (rlang::is_missing(mat)) {
return(
create_partial(
missing_args = list(
scale_factor = missing(scale_factor),
threads = missing(threads)
)
)
)
}
assert_is(mat, "IterableMatrix")
read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat)
mat <- mat %>% multiply_cols(1 / read_depth)
return(log1p(mat * scale_factor))
}
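The transformation implemented here can be sketched in base R on a small dense matrix (toy values; the package version streams over an `IterableMatrix` via `multiply_cols()`):

```r
# Base-R sketch of log normalization:
# x_ij -> log1p(x_ij * scale_factor / colSum_j)
counts <- matrix(c(1, 0, 2,
                   3, 1, 0), nrow = 2, byrow = TRUE)
scale_factor <- 1e4
read_depth <- colSums(counts)  # per-cell total counts
norm <- log1p(sweep(counts, 2, read_depth, "/") * scale_factor)
```

Note that `matrix_stats(...)$col_stats["mean", ] * nrow(mat)` in the function body is just a streaming way of computing the same `colSums`.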


#' @rdname normalize
#' @param feature_means (numeric, optional) Pre-calculated feature means to normalize by. If unnamed,
#'    values are matched positionally to the rows of the matrix; if named, each feature name is mapped
#'    to its mean value. If `NULL`, feature means are computed from `mat`.
#' @returns - `normalize_tfidf`: \eqn{\tilde{x}_{ij} = \log(\frac{x_{ij} \cdot \text{scaleFactor}}{\text{rowMean}_i\cdot \text{colSum}_j} + 1)}
#' @details - `normalize_tfidf`: This follows the formula from Stuart, Butler et al. 2019, used by default in `ArchR::addIterativeLSI()` and `Signac::RunTFIDF()`
#' @export
normalize_tfidf <- function(
mat, feature_means = NULL,
scale_factor = 1e4, threads = 1L
) {
assert_is_wholenumber(threads)
assert_is_numeric(scale_factor)
assert_greater_than_zero(scale_factor)
if (rlang::is_missing(mat)) {
return(
create_partial(
missing_args = list(
feature_means = missing(feature_means),
scale_factor = missing(scale_factor),
threads = missing(threads)
)
)
)
}
assert_is(mat, "IterableMatrix")
# If feature means are passed in, only need to calculate term frequency
if (is.null(feature_means)) {
mat_stats <- matrix_stats(mat, row_stats = c("mean"), col_stats = c("mean"), threads = threads)
feature_means <- mat_stats$row_stats["mean", ]
read_depth <- mat_stats$col_stats["mean", ] * nrow(mat)
} else {
assert_is_numeric(feature_means)
if (!is.null(names(feature_means)) && !is.null(rownames(mat))) {
# Every row name of the matrix must have an entry in feature_means;
# extra names in feature_means are allowed and dropped by the subsetting below
assert_true(all(rownames(mat) %in% names(feature_means)))
feature_means <- feature_means[rownames(mat)]
} else {
assert_len(feature_means, nrow(mat))
}
read_depth <- matrix_stats(mat, col_stats = c("mean"), threads = threads)$col_stats["mean", ] * nrow(mat)
}
tf <- mat %>% multiply_cols(1 / read_depth)
idf <- 1 / feature_means
tf_idf_mat <- tf %>% multiply_rows(idf)
return(log1p(tf_idf_mat * scale_factor))
}
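The TF-IDF formula can likewise be sketched in base R on a dense toy matrix (toy values; `sweep()` stands in for the package's `multiply_cols()`/`multiply_rows()` on iterable matrices):

```r
# Base-R sketch of TF-IDF:
# x_ij -> log1p(x_ij * scale_factor / (rowMean_i * colSum_j))
counts <- matrix(c(1, 0, 2,
                   3, 1, 0), nrow = 2, byrow = TRUE)
scale_factor <- 1e4
tf  <- sweep(counts, 2, colSums(counts), "/")  # term frequency per cell
idf <- 1 / rowMeans(counts)                    # inverse of feature means
tfidf <- log1p(sweep(tf, 1, idf, "*") * scale_factor)
```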
65 changes: 65 additions & 0 deletions r/R/utils.R
@@ -56,4 +56,69 @@ log_progress <- function(msg, add_timestamp = TRUE){
} else {
message(msg)
}
}

#' Helper to create partial functions
#'
#' Automatically creates a partial application of the caller
#' function including all non-missing arguments.
#'
#' @param missing_args (named list[bool]) Every entry with a `TRUE` value is
#' treated as missing and excluded from the partial application. Designed to be used in the caller with
#' `base::missing()` to detect unspecified arguments with default values,
#' or to manually specify other arguments that should not be specialized.
#' @return A `bpcells_partial` object (a function with some extra attributes)
#' @keywords internal
create_partial <- function(missing_args=list()) {
env <- rlang::caller_env()
fn_sym <- rlang::caller_call()[[1]]
fn <- rlang::caller_fn()

args <- list()
for (n in names(formals(fn))) {
if (n %in% names(missing_args) && missing_args[[n]]) next
if (rlang::is_missing(env[[n]])) next
args[[n]] <- env[[n]]
}

ret <- do.call(partial_apply, c(fn, args))
attr(ret, "body")[[1]] <- fn_sym
return(ret)
}


#' Create partial function calls
#'
#' Specify some but not all arguments to a function.
#'
#' @param f A function
#' @param ... Named arguments to `f`
#' @param .overwrite (bool) When `f` is already an output of
#' `partial_apply()`, whether newly supplied arguments overwrite the
#' previously stored definitions (`TRUE`) or are ignored in favor of them (`FALSE`)
#' @return A `bpcells_partial` object (a function with some extra attributes)
#' @keywords internal
partial_apply <- function(f, ..., .overwrite=FALSE) {
args <- rlang::list2(...)

if (is(f, "bpcells_partial")) {
prev_args <- attr(f, "args")
for (a in names(prev_args)) {
if (!(.overwrite && a %in% names(args))) {
args[[a]] <- prev_args[[a]]
}
}
f <- attr(f, "fn")
function_name <- attr(f, "body")[[1]]
} else {
function_name <- rlang::sym(rlang::caller_arg(f))
}
partial_fn <- do.call(purrr::partial, c(f, args))
attr(partial_fn, "body")[[1]] <- function_name
structure(
partial_fn,
class = c("bpcells_partial", "purrr_function_partial", "function"),
args = args,
fn = f
)
}
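The core idea of `partial_apply()` can be sketched with a plain closure (a minimal sketch only: the package version builds on `purrr::partial()` and additionally records the argument list, original function, and display name):

```r
# Minimal sketch of partial application: fix some arguments now,
# supply the rest at call time.
partial_sketch <- function(f, ...) {
  fixed <- list(...)
  function(...) do.call(f, c(fixed, list(...)))
}
add3 <- partial_sketch(function(x, y, z) x + y + z, y = 10, z = 100)
add3(1)  # 111
```

This is what lets a user write e.g. `select_features_variance(normalize = normalize_log)` without a matrix and get back a reusable one-argument function.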
22 changes: 22 additions & 0 deletions r/man/create_partial.Rd
