Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add AnnData dense matrix read support #146

Merged
merged 11 commits into from
Nov 25, 2024
29 changes: 23 additions & 6 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,20 @@ target_include_directories(
${SRC_INCLUDE}
)

add_library(
matrixIterators
${SRC}/simd/dense-multiply.cpp
)
target_link_libraries(
matrixIterators
arrayIO
)
target_include_directories(
matrixIterators
PUBLIC
${SRC_INCLUDE}
)

add_library(
fragmentIterators
${SRC}/fragmentIterators/BedFragments.cpp
Expand Down Expand Up @@ -178,19 +192,22 @@ gtest_discover_tests(test-fragmentUtils)
add_executable(
test-matrixIO
tests/test-matrixIO.cpp
${SRC}/matrixIterators/ImportMatrixHDF5.cpp
${SRC}/simd/dense-multiply.cpp
)
target_link_libraries(test-matrixIO arrayIO Eigen3::Eigen gtest_main gmock)
target_link_libraries(test-matrixIO matrixIterators Eigen3::Eigen gtest_main gmock)
gtest_discover_tests(test-matrixIO)

add_executable(
test-matrixIterators
tests/test-matrixIterators.cpp
)
target_link_libraries(test-matrixIterators matrixIterators Eigen3::Eigen gtest_main gmock)
gtest_discover_tests(test-matrixIterators)

add_executable(
test-matrixTranspose
tests/test-matrixTranspose.cpp
${SRC}/matrixIterators/ImportMatrixHDF5.cpp
${SRC}/simd/dense-multiply.cpp
)
target_link_libraries(test-matrixTranspose arrayIO Eigen3::Eigen gtest_main)
target_link_libraries(test-matrixTranspose matrixIterators Eigen3::Eigen gtest_main)
gtest_discover_tests(test-matrixTranspose)

# Build matrix math test for all available SIMD backends
Expand Down
2 changes: 0 additions & 2 deletions cpp/tests/test-matrixIO.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@
#include <matrixIterators/StoredMatrix.h>
#include <matrixIterators/StoredMatrixWriter.h>

#include <matrixIterators/ImportMatrixHDF5.h>

#include <Eigen/Core>

using namespace BPCells;
Expand Down
94 changes: 94 additions & 0 deletions cpp/tests/test-matrixIterators.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Copyright 2024 BPCells contributors
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#include <random>
#include <sstream>

#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <matrixIterators/CSparseMatrix.h>
#include <matrixIterators/FilterZeros.h>

#include <Eigen/Core>

using namespace BPCells;
using namespace ::testing;
using namespace Eigen;

SparseMatrix<double> generate_mat(uint32_t n_row, uint32_t n_col, uint32_t seed = 125124) {
std::mt19937 gen(seed); // Standard mersenne_twister_engine
std::uniform_int_distribution<> distrib(1, 20);
std::uniform_int_distribution<> nonzero(0, 4); // 1/5 chance of being non-zero

std::vector<Triplet<double>> triplets;

for (int i = 0; i < n_row; i++) {
for (int j = 0; j < n_col; j++) {
if (0 == nonzero(gen)) {
triplets.push_back({i, j, (double) distrib(gen)});
}
}
}

SparseMatrix<double> mat(n_row, n_col);
mat.setFromTriplets(triplets.begin(), triplets.end());
return mat;
}

Map<SparseMatrix<double>> get_map(SparseMatrix<double> &mat) {
return Map<SparseMatrix<double>>(
mat.rows(),
mat.cols(),
mat.nonZeros(),
(int *)mat.outerIndexPtr(),
(int *)mat.innerIndexPtr(),
(double *)mat.valuePtr()
);
}

bool matrices_are_identical(SparseMatrix<double> a, SparseMatrix<double> b) {
if (a.rows() != b.rows()) return false;
if (a.cols() != b.cols()) return false;
EXPECT_TRUE(a.isCompressed());
EXPECT_TRUE(b.isCompressed());
if (a.nonZeros() != b.nonZeros()) return false;
for (size_t i = 0; i < a.nonZeros(); i++) {
if (a.innerIndexPtr()[i] != b.innerIndexPtr()[i]) return false;
if (a.valuePtr()[i] != b.valuePtr()[i]) return false;
}
if (a.outerSize() != b.outerSize()) return false;
for (size_t i = 0; i < a.outerSize(); i++) {
if (a.outerIndexPtr()[i] != b.outerIndexPtr()[i]) return false;
}
return true;
}

TEST(MatrixIterators, FilterZeros) {
// A matrix without any explicit zeros should not be transformed
SparseMatrix<double> m1 = generate_mat(100, 50, 125123);
FilterZeros<double> m1_filt(std::make_unique<CSparseMatrix<double>>(get_map(m1), std::unique_ptr<StringReader>(), std::unique_ptr<StringReader>(), 5));

CSparseMatrixWriter<double> res_m1;
res_m1.write(m1_filt);
EXPECT_TRUE(matrices_are_identical(m1, res_m1.getMat()));

// Introduce explicit zeros into m2
SparseMatrix<double> m2 = m1;
for (auto &x : m2.coeffs()) {
if (x < 10) x = 0;
}
SparseMatrix<double> m2_pruned = m2.pruned();

// The explicit zeros should be filtered out
FilterZeros<double> m2_filt(std::make_unique<CSparseMatrix<double>>(get_map(m2), std::unique_ptr<StringReader>(), std::unique_ptr<StringReader>(), 5));
CSparseMatrixWriter<double> res_m2;
res_m2.write(m2_filt);
EXPECT_FALSE(matrices_are_identical(m2, res_m2.getMat()));
EXPECT_TRUE(matrices_are_identical(m2_pruned, res_m2.getMat()));
}
2 changes: 0 additions & 2 deletions cpp/tests/test-matrixTranspose.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,7 @@

#include <gtest/gtest.h>

#include <arrayIO/vector.h>
#include <matrixIterators/CSparseMatrix.h>
#include <matrixIterators/ImportMatrixHDF5.h>
#include <matrixIterators/StoredMatrixTransposeWriter.h>
#include <matrixIterators/StoredMatrixWriter.h>

Expand Down
1 change: 1 addition & 0 deletions r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Contributions welcome :)
- `trackplot_combine()` now has smarter layout logic for margins, as well as detecting when plots are being combined that cover different genomic regions. (pull request #116)
- `select_cells()` and `select_chromosomes()` now also allow using a logical mask for selection. (pull request #117)
- BPCells installation can now also be configured by setting the `LDFLAGS` or `CFLAGS` as environment variables in addition to setting them in `~/.R/Makevars` (pull request #124)
- `open_matrix_anndata_hdf5()` now supports reading AnnData matrices in the dense format. (pull request #146)
- `cluster_graph_leiden()` now has better defaults that produce reasonable cluster counts regardless of dataset size. (pull request #147)

## Bug-fixes
Expand Down
10 changes: 9 additions & 1 deletion r/R/matrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -2132,14 +2132,22 @@ setMethod("short_description", "AnnDataMatrixH5", function(x) {

#' Read/write AnnData matrix
#'
#' Read or write a sparse matrix from an anndata hdf5 file. These functions will
#' Read or write a matrix from an anndata hdf5 file. These functions will
#' automatically transpose matrices when converting to/from the AnnData
#' format. This is because the AnnData convention stores cells as rows, whereas the R
#' convention stores cells as columns. If this behavior is undesired, call `t()`
#' manually on the matrix inputs and outputs of these functions.
#'
#' @inheritParams open_matrix_hdf5
#' @return AnnDataMatrixH5 object, with cells as the columns.
#' @details Dimnames are inferred from `obs/_index` or `var/_index` based on length matching.
#' This helps to infer dimnames for `obsp`,` varm`, etc. If the number of `len(obs) == len(var)`,
#' dimname inference will be disabled.
#'
#' *Efficiency considerations*: Reading from a dense AnnData matrix will generally be slower
#' than sparse for single cell datasets, so it is recommended to re-write any dense AnnData
#' inputs to a sparse format early in processing. Note that `write_matrix_hdf5()` will only
#' write in the sparse format.
#' @export
open_matrix_anndata_hdf5 <- function(path, group = "X", buffer_size = 16384L) {
assert_is_file(path)
Expand Down
12 changes: 11 additions & 1 deletion r/man/open_matrix_anndata_hdf5.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion r/src/Makevars.in
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ bpcells-cpp/fragmentIterators/StoredFragments.o \
bpcells-cpp/fragmentUtils/BedWriter.o \
bpcells-cpp/fragmentUtils/FootprintMatrix.o \
bpcells-cpp/fragmentUtils/InsertionIterator.o \
bpcells-cpp/matrixIterators/ImportMatrixHDF5.o \
bpcells-cpp/matrixIterators/ImportMatrix10xHDF5.o \
bpcells-cpp/matrixIterators/ImportMatrixAnnDataHDF5.o \
bpcells-cpp/matrixIterators/PeakMatrix.o \
bpcells-cpp/matrixIterators/TileMatrix.o \
bpcells-cpp/matrixIterators/MatrixStats.o \
Expand Down
6 changes: 6 additions & 0 deletions r/src/R_xptr_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ SEXP make_unique_xptr(Args&&... args) {
return Rcpp::wrap(Rcpp::XPtr<std::unique_ptr<T>>(new std::unique_ptr<T>(new T(std::forward<Args>(args)...))));
}

// Construct the XPtr wrapper if we already have a unique_ptr object
template<class T>
SEXP unique_xptr(std::unique_ptr<T> &&ptr) {
return Rcpp::wrap(Rcpp::XPtr<std::unique_ptr<T>>(new std::unique_ptr<T>(std::move(ptr))));
}

// Take ownership of the unique_ptr from the XPtr
template<class T>
std::unique_ptr<T> take_unique_xptr(SEXP &sexp) {
Expand Down
50 changes: 50 additions & 0 deletions r/src/bpcells-cpp/matrixIterators/FilterZeros.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Copyright 2024 BPCells contributors
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#include <atomic>

#include "../arrayIO/array_interfaces.h"
#include "MatrixIterator.h"

namespace BPCells {

// Filter out zero values from a MatrixLoader
// This is useful when reading dense matrices that have many zero values,
// or when performing operations that will cause new zeros to be created (e.g. multiplying a row by zero)
template<typename T>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very cool and elegant! Can already see this being used in a handful of derived matrixloader types

class FilterZeros : public MatrixLoaderWrapper<T> {
private:
size_t capacity_ = 0;
public:
FilterZeros(std::unique_ptr<MatrixLoader<T>> &&loader) : MatrixLoaderWrapper<T>(std::move(loader)) {}

// Return false if there are no more entries to load
bool load() override {
capacity_ = 0;

while (capacity_ == 0) {
if (!this->loader->load()) return false;

uint32_t *row_data = this->loader->rowData();
T* val_data = this->loader->valData();
size_t cap = this->loader->capacity();

for (size_t i = 0; i < cap; i++) {
row_data[capacity_] = row_data[i];
val_data[capacity_] = val_data[i];
capacity_ += val_data[capacity_] != 0;
}
}
return true;
}

// Number of loaded entries available
uint32_t capacity() const override {return capacity_;}
};

} // end namespace BPCells
Loading
Loading