bnprks · bnprks · Nov 25, 2024 · Oct 22, 2024 · Oct 23, 2024 · Oct 23, 2024
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -101,6 +101,20 @@ target_include_directories(
     ${SRC_INCLUDE}
 )
 
+add_library(
+    matrixIterators
+    ${SRC}/simd/dense-multiply.cpp
+)
+target_link_libraries(
+    matrixIterators
+    arrayIO
+)
+target_include_directories(
+    matrixIterators 
+    PUBLIC 
+    ${SRC_INCLUDE}
+)
+
 add_library(
     fragmentIterators
     ${SRC}/fragmentIterators/BedFragments.cpp
@@ -178,19 +192,22 @@ gtest_discover_tests(test-fragmentUtils)
 add_executable(
     test-matrixIO
     tests/test-matrixIO.cpp
-    ${SRC}/matrixIterators/ImportMatrixHDF5.cpp
-    ${SRC}/simd/dense-multiply.cpp
 )
-target_link_libraries(test-matrixIO arrayIO Eigen3::Eigen gtest_main gmock)
+target_link_libraries(test-matrixIO matrixIterators Eigen3::Eigen gtest_main gmock)
 gtest_discover_tests(test-matrixIO)
 
+add_executable(
+    test-matrixIterators
+    tests/test-matrixIterators.cpp
+)
+target_link_libraries(test-matrixIterators matrixIterators Eigen3::Eigen gtest_main gmock)
+gtest_discover_tests(test-matrixIterators)
+
 add_executable(
     test-matrixTranspose
     tests/test-matrixTranspose.cpp
-    ${SRC}/matrixIterators/ImportMatrixHDF5.cpp
-    ${SRC}/simd/dense-multiply.cpp
 )
-target_link_libraries(test-matrixTranspose arrayIO Eigen3::Eigen gtest_main)
+target_link_libraries(test-matrixTranspose matrixIterators Eigen3::Eigen gtest_main)
 gtest_discover_tests(test-matrixTranspose)
 
 # Build matrix math test for all available SIMD backends

diff --git a/cpp/tests/test-matrixIO.cpp b/cpp/tests/test-matrixIO.cpp
@@ -20,8 +20,6 @@
 #include <matrixIterators/StoredMatrix.h>
 #include <matrixIterators/StoredMatrixWriter.h>
 
-#include <matrixIterators/ImportMatrixHDF5.h>
-
 #include <Eigen/Core>
 
 using namespace BPCells;

diff --git a/cpp/tests/test-matrixIterators.cpp b/cpp/tests/test-matrixIterators.cpp
@@ -0,0 +1,94 @@
+// Copyright 2024 BPCells contributors
+// 
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#include <random>
+#include <sstream>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <matrixIterators/CSparseMatrix.h>
+#include <matrixIterators/FilterZeros.h>
+
+#include <Eigen/Core>
+
+using namespace BPCells;
+using namespace ::testing;
+using namespace Eigen;
+
+SparseMatrix<double> generate_mat(uint32_t n_row, uint32_t n_col, uint32_t seed = 125124) {
+    std::mt19937 gen(seed); // Standard mersenne_twister_engine
+    std::uniform_int_distribution<> distrib(1, 20);
+    std::uniform_int_distribution<> nonzero(0, 4); // 1/5 chance of being non-zero
+
+    std::vector<Triplet<double>> triplets;
+
+    for (int i = 0; i < n_row; i++) {
+        for (int j = 0; j < n_col; j++) {
+            if (0 == nonzero(gen)) {
+                triplets.push_back({i, j, (double) distrib(gen)});
+            }
+        }
+    }
+
+    SparseMatrix<double> mat(n_row, n_col);
+    mat.setFromTriplets(triplets.begin(), triplets.end());
+    return mat;
+}
+
+Map<SparseMatrix<double>> get_map(SparseMatrix<double> &mat) {
+    return Map<SparseMatrix<double>>(
+        mat.rows(),
+        mat.cols(),
+        mat.nonZeros(),
+        (int *)mat.outerIndexPtr(),
+        (int *)mat.innerIndexPtr(),
+        (double *)mat.valuePtr()
+    );
+}
+
+bool matrices_are_identical(SparseMatrix<double> a, SparseMatrix<double> b) {
+    if (a.rows() != b.rows()) return false;
+    if (a.cols() != b.cols()) return false;
+    EXPECT_TRUE(a.isCompressed());
+    EXPECT_TRUE(b.isCompressed());
+    if (a.nonZeros() != b.nonZeros()) return false;
+    for (size_t i = 0; i < a.nonZeros(); i++) {
+        if (a.innerIndexPtr()[i] != b.innerIndexPtr()[i]) return false;
+        if (a.valuePtr()[i] != b.valuePtr()[i]) return false;
+    }
+    if (a.outerSize() != b.outerSize()) return false;
+    for (size_t i = 0; i < a.outerSize(); i++) {
+        if (a.outerIndexPtr()[i] != b.outerIndexPtr()[i]) return false;
+    } 
+    return true;
+}
+
+TEST(MatrixIterators, FilterZeros) {
+    // A matrix without any explicit zeros should not be transformed
+    SparseMatrix<double> m1 = generate_mat(100, 50, 125123);
+    FilterZeros<double> m1_filt(std::make_unique<CSparseMatrix<double>>(get_map(m1), std::unique_ptr<StringReader>(), std::unique_ptr<StringReader>(), 5));
+
+    CSparseMatrixWriter<double> res_m1;
+    res_m1.write(m1_filt);
+    EXPECT_TRUE(matrices_are_identical(m1, res_m1.getMat()));
+
+    // Introduce explicit zeros into m2
+    SparseMatrix<double> m2 = m1;
+    for (auto &x : m2.coeffs()) {
+        if (x < 10) x = 0;
+    }
+    SparseMatrix<double> m2_pruned = m2.pruned();
+
+    // The explicit zeros should be filtered out
+    FilterZeros<double> m2_filt(std::make_unique<CSparseMatrix<double>>(get_map(m2), std::unique_ptr<StringReader>(), std::unique_ptr<StringReader>(), 5));
+    CSparseMatrixWriter<double> res_m2;
+    res_m2.write(m2_filt);
+    EXPECT_FALSE(matrices_are_identical(m2, res_m2.getMat()));
+    EXPECT_TRUE(matrices_are_identical(m2_pruned, res_m2.getMat()));
+}
diff --git a/cpp/tests/test-matrixTranspose.cpp b/cpp/tests/test-matrixTranspose.cpp
@@ -11,9 +11,7 @@
 
 #include <gtest/gtest.h>
 
-#include <arrayIO/vector.h>
 #include <matrixIterators/CSparseMatrix.h>
-#include <matrixIterators/ImportMatrixHDF5.h>
 #include <matrixIterators/StoredMatrixTransposeWriter.h>
 #include <matrixIterators/StoredMatrixWriter.h>
 

diff --git a/r/NEWS.md b/r/NEWS.md
@@ -28,6 +28,7 @@ Contributions welcome :)
 - `trackplot_combine()` now has smarter layout logic for margins, as well as detecting when plots are being combined that cover different genomic regions. (pull request #116)
 - `select_cells()` and `select_chromosomes()` now also allow using a logical mask for selection. (pull request #117)
 - BPCells installation can now also be configured by setting the `LDFLAGS` or `CFLAGS` as environment variables in addition to setting them in `~/.R/Makevars` (pull request #124)
+- `open_matrix_anndata_hdf5()` now supports reading AnnData matrices in the dense format. (pull request #146)
 - `cluster_graph_leiden()` now has better defaults that produce reasonable cluster counts regardless of dataset size. (pull request #147) 
 
 ## Bug-fixes

diff --git a/r/R/matrix.R b/r/R/matrix.R
@@ -2132,14 +2132,22 @@ setMethod("short_description", "AnnDataMatrixH5", function(x) {
 
 #' Read/write AnnData matrix
 #'
-#' Read or write a sparse matrix from an anndata hdf5 file. These functions will
+#' Read or write a matrix from an anndata hdf5 file. These functions will
 #' automatically transpose matrices when converting to/from the AnnData
 #' format. This is because the AnnData convention stores cells as rows, whereas the R
 #' convention stores cells as columns. If this behavior is undesired, call `t()`
 #' manually on the matrix inputs and outputs of these functions. 
 #' 
 #' @inheritParams open_matrix_hdf5
 #' @return AnnDataMatrixH5 object, with cells as the columns.
+#' @details Dimnames are inferred from `obs/_index` or `var/_index` based on length matching.
+#'   This helps to infer dimnames for `obsp`,` varm`, etc. If the number of `len(obs) == len(var)`,
+#'   dimname inference will be disabled.
+#'
+#'   *Efficiency considerations*: Reading from a dense AnnData matrix will generally be slower
+#'   than sparse for single cell datasets, so it is recommended to re-write any dense AnnData
+#'   inputs to a sparse format early in processing. Note that `write_matrix_hdf5()` will only
+#'   write in the sparse format.
 #' @export
 open_matrix_anndata_hdf5 <- function(path, group = "X", buffer_size = 16384L) {
   assert_is_file(path)

diff --git a/r/man/open_matrix_anndata_hdf5.Rd b/r/man/open_matrix_anndata_hdf5.Rd
diff --git a/r/src/Makevars.in b/r/src/Makevars.in
@@ -35,7 +35,8 @@ bpcells-cpp/fragmentIterators/StoredFragments.o \
 bpcells-cpp/fragmentUtils/BedWriter.o \
 bpcells-cpp/fragmentUtils/FootprintMatrix.o \
 bpcells-cpp/fragmentUtils/InsertionIterator.o \
-bpcells-cpp/matrixIterators/ImportMatrixHDF5.o \
+bpcells-cpp/matrixIterators/ImportMatrix10xHDF5.o \
+bpcells-cpp/matrixIterators/ImportMatrixAnnDataHDF5.o \
 bpcells-cpp/matrixIterators/PeakMatrix.o \
 bpcells-cpp/matrixIterators/TileMatrix.o \
 bpcells-cpp/matrixIterators/MatrixStats.o \

diff --git a/r/src/R_xptr_wrapper.h b/r/src/R_xptr_wrapper.h
@@ -33,6 +33,12 @@ SEXP make_unique_xptr(Args&&... args) {
   return Rcpp::wrap(Rcpp::XPtr<std::unique_ptr<T>>(new std::unique_ptr<T>(new T(std::forward<Args>(args)...))));
 }
 
+// Construct the XPtr wrapper if we already have a unique_ptr object
+template<class T>
+SEXP unique_xptr(std::unique_ptr<T> &&ptr) {
+  return Rcpp::wrap(Rcpp::XPtr<std::unique_ptr<T>>(new std::unique_ptr<T>(std::move(ptr))));
+}
+
 // Take ownership of the unique_ptr from the XPtr
 template<class T>
 std::unique_ptr<T> take_unique_xptr(SEXP &sexp) {

diff --git a/r/src/bpcells-cpp/matrixIterators/FilterZeros.h b/r/src/bpcells-cpp/matrixIterators/FilterZeros.h
@@ -0,0 +1,50 @@
+// Copyright 2024 BPCells contributors
+// 
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+#include <atomic>
+
+#include "../arrayIO/array_interfaces.h"
+#include "MatrixIterator.h"
+
+namespace BPCells {
+
+// Filter out zero values from a MatrixLoader
+// This is useful when reading dense matrices that have many zero values,
+// or when performing operations that will cause new zeros to be created (e.g. multiplying a row by zero)
+template<typename T>
+class FilterZeros : public MatrixLoaderWrapper<T> {
+  private:
+    size_t capacity_ = 0;
+  public:
+    FilterZeros(std::unique_ptr<MatrixLoader<T>> &&loader) : MatrixLoaderWrapper<T>(std::move(loader)) {}
+
+    // Return false if there are no more entries to load
+    bool load() override {
+        capacity_ = 0;
+
+        while (capacity_ == 0) {
+            if (!this->loader->load()) return false;
+
+            uint32_t *row_data = this->loader->rowData();
+            T* val_data = this->loader->valData();
+            size_t cap = this->loader->capacity();
+
+            for (size_t i = 0; i < cap; i++) {
+                row_data[capacity_] = row_data[i];
+                val_data[capacity_] = val_data[i];
+                capacity_ += val_data[capacity_] != 0;
+            }
+        }
+        return true;
+    }
+
+    // Number of loaded entries available
+    uint32_t capacity() const override {return capacity_;}
+};
+
+} // end namespace BPCells