diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 16040bc9..4147de5f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -101,6 +101,20 @@ target_include_directories( ${SRC_INCLUDE} ) +add_library( + matrixIterators + ${SRC}/simd/dense-multiply.cpp +) +target_link_libraries( + matrixIterators + arrayIO +) +target_include_directories( + matrixIterators + PUBLIC + ${SRC_INCLUDE} +) + add_library( fragmentIterators ${SRC}/fragmentIterators/BedFragments.cpp @@ -178,19 +192,22 @@ gtest_discover_tests(test-fragmentUtils) add_executable( test-matrixIO tests/test-matrixIO.cpp - ${SRC}/matrixIterators/ImportMatrixHDF5.cpp - ${SRC}/simd/dense-multiply.cpp ) -target_link_libraries(test-matrixIO arrayIO Eigen3::Eigen gtest_main gmock) +target_link_libraries(test-matrixIO matrixIterators Eigen3::Eigen gtest_main gmock) gtest_discover_tests(test-matrixIO) +add_executable( + test-matrixIterators + tests/test-matrixIterators.cpp +) +target_link_libraries(test-matrixIterators matrixIterators Eigen3::Eigen gtest_main gmock) +gtest_discover_tests(test-matrixIterators) + add_executable( test-matrixTranspose tests/test-matrixTranspose.cpp - ${SRC}/matrixIterators/ImportMatrixHDF5.cpp - ${SRC}/simd/dense-multiply.cpp ) -target_link_libraries(test-matrixTranspose arrayIO Eigen3::Eigen gtest_main) +target_link_libraries(test-matrixTranspose matrixIterators Eigen3::Eigen gtest_main) gtest_discover_tests(test-matrixTranspose) # Build matrix math test for all available SIMD backends diff --git a/cpp/tests/test-matrixIO.cpp b/cpp/tests/test-matrixIO.cpp index 478f2449..a967fad3 100644 --- a/cpp/tests/test-matrixIO.cpp +++ b/cpp/tests/test-matrixIO.cpp @@ -20,8 +20,6 @@ #include #include -#include - #include using namespace BPCells; diff --git a/cpp/tests/test-matrixIterators.cpp b/cpp/tests/test-matrixIterators.cpp new file mode 100644 index 00000000..15132416 --- /dev/null +++ b/cpp/tests/test-matrixIterators.cpp @@ -0,0 +1,94 @@ +// Copyright 2024 BPCells contributors +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#include +#include + +#include +#include + +#include +#include + +#include + +using namespace BPCells; +using namespace ::testing; +using namespace Eigen; + +SparseMatrix generate_mat(uint32_t n_row, uint32_t n_col, uint32_t seed = 125124) { + std::mt19937 gen(seed); // Standard mersenne_twister_engine + std::uniform_int_distribution<> distrib(1, 20); + std::uniform_int_distribution<> nonzero(0, 4); // 1/5 chance of being non-zero + + std::vector> triplets; + + for (int i = 0; i < n_row; i++) { + for (int j = 0; j < n_col; j++) { + if (0 == nonzero(gen)) { + triplets.push_back({i, j, (double) distrib(gen)}); + } + } + } + + SparseMatrix mat(n_row, n_col); + mat.setFromTriplets(triplets.begin(), triplets.end()); + return mat; +} + +Map> get_map(SparseMatrix &mat) { + return Map>( + mat.rows(), + mat.cols(), + mat.nonZeros(), + (int *)mat.outerIndexPtr(), + (int *)mat.innerIndexPtr(), + (double *)mat.valuePtr() + ); +} + +bool matrices_are_identical(SparseMatrix a, SparseMatrix b) { + if (a.rows() != b.rows()) return false; + if (a.cols() != b.cols()) return false; + EXPECT_TRUE(a.isCompressed()); + EXPECT_TRUE(b.isCompressed()); + if (a.nonZeros() != b.nonZeros()) return false; + for (size_t i = 0; i < a.nonZeros(); i++) { + if (a.innerIndexPtr()[i] != b.innerIndexPtr()[i]) return false; + if (a.valuePtr()[i] != b.valuePtr()[i]) return false; + } + if (a.outerSize() != b.outerSize()) return false; + for (size_t i = 0; i < a.outerSize(); i++) { + if (a.outerIndexPtr()[i] != b.outerIndexPtr()[i]) return false; + } + return true; +} + +TEST(MatrixIterators, FilterZeros) { + // A matrix without any explicit zeros should not be transformed + SparseMatrix m1 = generate_mat(100, 50, 125123); + FilterZeros m1_filt(std::make_unique>(get_map(m1), std::unique_ptr(), std::unique_ptr(), 5)); + + CSparseMatrixWriter res_m1; + res_m1.write(m1_filt); + EXPECT_TRUE(matrices_are_identical(m1, res_m1.getMat())); + + // Introduce explicit zeros into m2 + SparseMatrix m2 = m1; + for (auto &x : m2.coeffs()) { + if (x < 10) x = 0; + } + SparseMatrix m2_pruned = m2.pruned(); + + // The explicit zeros should be filtered out + FilterZeros m2_filt(std::make_unique>(get_map(m2), std::unique_ptr(), std::unique_ptr(), 5)); + CSparseMatrixWriter res_m2; + res_m2.write(m2_filt); + EXPECT_FALSE(matrices_are_identical(m2, res_m2.getMat())); + EXPECT_TRUE(matrices_are_identical(m2_pruned, res_m2.getMat())); +} diff --git a/cpp/tests/test-matrixTranspose.cpp b/cpp/tests/test-matrixTranspose.cpp index bd54bf01..1fc627ab 100644 --- a/cpp/tests/test-matrixTranspose.cpp +++ b/cpp/tests/test-matrixTranspose.cpp @@ -11,9 +11,7 @@ #include -#include #include -#include #include #include diff --git a/r/NEWS.md b/r/NEWS.md index f14ddac8..9715c95a 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -28,6 +28,7 @@ Contributions welcome :) - `trackplot_combine()` now has smarter layout logic for margins, as well as detecting when plots are being combined that cover different genomic regions. (pull request #116) - `select_cells()` and `select_chromosomes()` now also allow using a logical mask for selection. (pull request #117) - BPCells installation can now also be configured by setting the `LDFLAGS` or `CFLAGS` as environment variables in addition to setting them in `~/.R/Makevars` (pull request #124) +- `open_matrix_anndata_hdf5()` now supports reading AnnData matrices in the dense format. (pull request #146) - `cluster_graph_leiden()` now has better defaults that produce reasonable cluster counts regardless of dataset size. (pull request #147) ## Bug-fixes diff --git a/r/R/matrix.R b/r/R/matrix.R index fbbeb3b9..66e391c1 100644 --- a/r/R/matrix.R +++ b/r/R/matrix.R @@ -2132,7 +2132,7 @@ setMethod("short_description", "AnnDataMatrixH5", function(x) { #' Read/write AnnData matrix #' -#' Read or write a sparse matrix from an anndata hdf5 file. These functions will +#' Read or write a matrix from an anndata hdf5 file. These functions will #' automatically transpose matrices when converting to/from the AnnData #' format. This is because the AnnData convention stores cells as rows, whereas the R #' convention stores cells as columns. If this behavior is undesired, call `t()` @@ -2140,6 +2140,14 @@ setMethod("short_description", "AnnDataMatrixH5", function(x) { #' #' @inheritParams open_matrix_hdf5 #' @return AnnDataMatrixH5 object, with cells as the columns. +#' @details Dimnames are inferred from `obs/_index` or `var/_index` based on length matching. +#' This helps to infer dimnames for `obsp`,` varm`, etc. If the number of `len(obs) == len(var)`, +#' dimname inference will be disabled. +#' +#' *Efficiency considerations*: Reading from a dense AnnData matrix will generally be slower +#' than sparse for single cell datasets, so it is recommended to re-write any dense AnnData +#' inputs to a sparse format early in processing. Note that `write_matrix_hdf5()` will only +#' write in the sparse format. #' @export open_matrix_anndata_hdf5 <- function(path, group = "X", buffer_size = 16384L) { assert_is_file(path) diff --git a/r/man/open_matrix_anndata_hdf5.Rd b/r/man/open_matrix_anndata_hdf5.Rd index 85c64ea9..8a8759a1 100644 --- a/r/man/open_matrix_anndata_hdf5.Rd +++ b/r/man/open_matrix_anndata_hdf5.Rd @@ -33,9 +33,19 @@ in memory before calling writes to disk.} AnnDataMatrixH5 object, with cells as the columns. } \description{ -Read or write a sparse matrix from an anndata hdf5 file. These functions will +Read or write a matrix from an anndata hdf5 file. These functions will automatically transpose matrices when converting to/from the AnnData format. This is because the AnnData convention stores cells as rows, whereas the R convention stores cells as columns. If this behavior is undesired, call \code{t()} manually on the matrix inputs and outputs of these functions. } +\details{ +Dimnames are inferred from \verb{obs/_index} or \verb{var/_index} based on length matching. +This helps to infer dimnames for \code{obsp},\code{ varm}, etc. If the number of \code{len(obs) == len(var)}, +dimname inference will be disabled. + +\emph{Efficiency considerations}: Reading from a dense AnnData matrix will generally be slower +than sparse for single cell datasets, so it is recommended to re-write any dense AnnData +inputs to a sparse format early in processing. Note that \code{write_matrix_hdf5()} will only +write in the sparse format. +} diff --git a/r/src/Makevars.in b/r/src/Makevars.in index 14470e0c..a3d987f4 100644 --- a/r/src/Makevars.in +++ b/r/src/Makevars.in @@ -35,7 +35,8 @@ bpcells-cpp/fragmentIterators/StoredFragments.o \ bpcells-cpp/fragmentUtils/BedWriter.o \ bpcells-cpp/fragmentUtils/FootprintMatrix.o \ bpcells-cpp/fragmentUtils/InsertionIterator.o \ -bpcells-cpp/matrixIterators/ImportMatrixHDF5.o \ +bpcells-cpp/matrixIterators/ImportMatrix10xHDF5.o \ +bpcells-cpp/matrixIterators/ImportMatrixAnnDataHDF5.o \ bpcells-cpp/matrixIterators/PeakMatrix.o \ bpcells-cpp/matrixIterators/TileMatrix.o \ bpcells-cpp/matrixIterators/MatrixStats.o \ diff --git a/r/src/R_xptr_wrapper.h b/r/src/R_xptr_wrapper.h index 9ec1eca2..d6f4a3dd 100644 --- a/r/src/R_xptr_wrapper.h +++ b/r/src/R_xptr_wrapper.h @@ -33,6 +33,12 @@ SEXP make_unique_xptr(Args&&... args) { return Rcpp::wrap(Rcpp::XPtr>(new std::unique_ptr(new T(std::forward(args)...)))); } +// Construct the XPtr wrapper if we already have a unique_ptr object +template +SEXP unique_xptr(std::unique_ptr &&ptr) { + return Rcpp::wrap(Rcpp::XPtr>(new std::unique_ptr(std::move(ptr)))); +} + // Take ownership of the unique_ptr from the XPtr template std::unique_ptr take_unique_xptr(SEXP &sexp) { diff --git a/r/src/bpcells-cpp/matrixIterators/FilterZeros.h b/r/src/bpcells-cpp/matrixIterators/FilterZeros.h new file mode 100644 index 00000000..ec859fea --- /dev/null +++ b/r/src/bpcells-cpp/matrixIterators/FilterZeros.h @@ -0,0 +1,50 @@ +// Copyright 2024 BPCells contributors +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#include + +#include "../arrayIO/array_interfaces.h" +#include "MatrixIterator.h" + +namespace BPCells { + +// Filter out zero values from a MatrixLoader +// This is useful when reading dense matrices that have many zero values, +// or when performing operations that will cause new zeros to be created (e.g. multiplying a row by zero) +template +class FilterZeros : public MatrixLoaderWrapper { + private: + size_t capacity_ = 0; + public: + FilterZeros(std::unique_ptr> &&loader) : MatrixLoaderWrapper(std::move(loader)) {} + + // Return false if there are no more entries to load + bool load() override { + capacity_ = 0; + + while (capacity_ == 0) { + if (!this->loader->load()) return false; + + uint32_t *row_data = this->loader->rowData(); + T* val_data = this->loader->valData(); + size_t cap = this->loader->capacity(); + + for (size_t i = 0; i < cap; i++) { + row_data[capacity_] = row_data[i]; + val_data[capacity_] = val_data[i]; + capacity_ += val_data[capacity_] != 0; + } + } + return true; + } + + // Number of loaded entries available + uint32_t capacity() const override {return capacity_;} +}; + +} // end namespace BPCells \ No newline at end of file diff --git a/r/src/bpcells-cpp/matrixIterators/ImportMatrix10xHDF5.cpp b/r/src/bpcells-cpp/matrixIterators/ImportMatrix10xHDF5.cpp new file mode 100644 index 00000000..ddc3b9df --- /dev/null +++ b/r/src/bpcells-cpp/matrixIterators/ImportMatrix10xHDF5.cpp @@ -0,0 +1,226 @@ +// Copyright 2024 BPCells contributors +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#include "ImportMatrix10xHDF5.h" +#include "../arrayIO/array_interfaces.h" +#include "../arrayIO/vector.h" + +namespace BPCells { + +// Reader interfaces for 10x and AnnData matrices +template +std::unique_ptr> open10xFeatureMatrix( + std::string file, + std::string group, + uint32_t buffer_size, + std::unique_ptr &&row_names, + std::unique_ptr &&col_names, + uint32_t read_size +) { + H5ReaderBuilder rb(file, group, buffer_size, read_size); + if (rb.getGroup().exist("features/id")) { + // Most up-to-date matrix format (cellranger V3+) + uint32_t rows = rb.openUIntReader("shape").read_one(); + if (row_names.get() == nullptr) { + row_names = rb.openStringReader("features/id"); + } + if (col_names.get() == nullptr) { + col_names = rb.openStringReader("barcodes"); + } + return std::make_unique>( + rb.openULongReader("indices").convert(), + rb.open("data"), + rb.openULongReader("indptr"), + rows, + std::move(row_names), + std::move(col_names) + ); + } + + // Older-style 10x matrix format + uint32_t rows = rb.openUIntReader("shape").read_one(); + if (row_names.get() == nullptr) { + row_names = rb.openStringReader("genes"); + } + if (col_names.get() == nullptr) { + col_names = rb.openStringReader("barcodes"); + } + return std::make_unique>( + rb.openULongReader("indices").convert(), + rb.open("data"), + rb.openULongReader("indptr"), + rows, + std::move(row_names), + std::move(col_names) + ); +} + +template +std::unique_ptr> open10xFeatureMatrix( + std::string file, std::string group, uint32_t buffer_size, uint32_t read_size +) { + return open10xFeatureMatrix( + file, + group, + buffer_size, + std::unique_ptr(nullptr), + std::unique_ptr(nullptr), + read_size + ); +} + +template +StoredMatrixWriter create10xFeatureMatrix( + std::string file_path, + StringReader &&barcodes, + StringReader &&feature_ids, + StringReader &&feature_names, + StringReader &&feature_types, + const std::map> &feature_metadata, + uint32_t buffer_size, + uint32_t chunk_size, + uint32_t gzip_level +) { + H5WriterBuilder wb(file_path, "matrix", buffer_size, chunk_size, false, gzip_level); + + wb.createStringWriter("barcodes")->write(barcodes); + wb.createStringWriter("features/id")->write(feature_ids); + wb.createStringWriter("features/name")->write(feature_names); + wb.createStringWriter("features/feature_type")->write(feature_types); + std::vector tag_keys; + for (const auto &[key, value] : feature_metadata) { + wb.createStringWriter(std::string("features/") + key)->write(*value); + tag_keys.push_back(key); + } + wb.createStringWriter("features/_all_tag_keys")->write(VecStringReader(tag_keys)); + + return StoredMatrixWriter( + wb.createULongWriter("indices").convert(), + wb.create("data"), + wb.createULongWriter("indptr"), + wb.createUIntWriter("shape"), + std::make_unique(), + std::make_unique(), + std::make_unique() + ); +} + +// Explicit template instantiations for 10x matrix +template std::unique_ptr> open10xFeatureMatrix( + std::string file, + std::string group, + uint32_t buffer_size, + std::unique_ptr &&row_names, + std::unique_ptr &&col_names, + uint32_t read_size +); +template std::unique_ptr> open10xFeatureMatrix( + std::string file, + std::string group, + uint32_t buffer_size, + std::unique_ptr &&row_names, + std::unique_ptr &&col_names, + uint32_t read_size +); +template std::unique_ptr> open10xFeatureMatrix( + std::string file, + std::string group, + uint32_t buffer_size, + std::unique_ptr &&row_names, + std::unique_ptr &&col_names, + uint32_t read_size +); +template std::unique_ptr> open10xFeatureMatrix( + std::string file, + std::string group, + uint32_t buffer_size, + std::unique_ptr &&row_names, + std::unique_ptr &&col_names, + uint32_t read_size +); + +template std::unique_ptr> open10xFeatureMatrix( + std::string file, std::string group, uint32_t buffer_size, uint32_t read_size +); +template std::unique_ptr> open10xFeatureMatrix( + std::string file, std::string group, uint32_t buffer_size, uint32_t read_size +); +template std::unique_ptr> open10xFeatureMatrix( + std::string file, std::string group, uint32_t buffer_size, uint32_t read_size +); +template std::unique_ptr> open10xFeatureMatrix( + std::string file, std::string group, uint32_t buffer_size, uint32_t read_size +); + +template StoredMatrixWriter create10xFeatureMatrix( + std::string file_path, + StringReader &&barcodes, + StringReader &&feature_ids, + StringReader &&feature_names, + StringReader &&feature_types, + const std::map> &feature_metadata, + uint32_t buffer_size, + uint32_t chunk_size, + uint32_t gzip_level +); +template StoredMatrixWriter create10xFeatureMatrix( + std::string file_path, + StringReader &&barcodes, + StringReader &&feature_ids, + StringReader &&feature_names, + StringReader &&feature_types, + const std::map> &feature_metadata, + uint32_t buffer_size, + uint32_t chunk_size, + uint32_t gzip_level +); +template StoredMatrixWriter create10xFeatureMatrix( + std::string file_path, + StringReader &&barcodes, + StringReader &&feature_ids, + StringReader &&feature_names, + StringReader &&feature_types, + const std::map> &feature_metadata, + uint32_t buffer_size, + uint32_t chunk_size, + uint32_t gzip_level +); +template StoredMatrixWriter create10xFeatureMatrix( + std::string file_path, + StringReader &&barcodes, + StringReader &&feature_ids, + StringReader &&feature_names, + StringReader &&feature_types, + const std::map> &feature_metadata, + uint32_t buffer_size, + uint32_t chunk_size, + uint32_t gzip_level +); +// End explicit template instantiations for 10x matrix + +std::string get10xMatrixType(std::string file, std::string group) { + H5ReaderBuilder rb(file, group, 1024, 1024); + HighFive::SilenceHDF5 s; + HighFive::DataType t = rb.getGroup().getDataSet("data").getDataType(); + if (t == HighFive::AtomicType()) { + return "uint32_t"; + } else if (t == HighFive::AtomicType()) { + return "uint32_t"; + } else if (t == HighFive::AtomicType()) { + return "uint64_t"; + } else if (t == HighFive::AtomicType()) { + return "float"; + } else if (t == HighFive::AtomicType()) { + return "double"; + } + throw std::runtime_error( + "get10xMatrixType: unrecognized type for group " + group + " in file " + file + ); +} + +} // end namespace BPCells diff --git a/r/src/bpcells-cpp/matrixIterators/ImportMatrix10xHDF5.h b/r/src/bpcells-cpp/matrixIterators/ImportMatrix10xHDF5.h new file mode 100644 index 00000000..dfc27196 --- /dev/null +++ b/r/src/bpcells-cpp/matrixIterators/ImportMatrix10xHDF5.h @@ -0,0 +1,49 @@ +// Copyright 2024 BPCells contributors +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#pragma once +#include "../arrayIO/hdf5.h" +#include "StoredMatrix.h" +#include "StoredMatrixWriter.h" + + +namespace BPCells { + +// Reader interfaces for 10x and AnnData matrices +// Open 10x matrix should not assume uint32_t +template +std::unique_ptr> open10xFeatureMatrix( + std::string file, std::string group, uint32_t buffer_size, uint32_t read_size = 1024 +); + +template +std::unique_ptr> open10xFeatureMatrix( + std::string file, + std::string group, + uint32_t buffer_size, + std::unique_ptr &&row_names, + std::unique_ptr &&col_names, + uint32_t read_size = 1024 +); + +template +StoredMatrixWriter create10xFeatureMatrix( + std::string file, + StringReader &&barcodes, + StringReader &&feature_ids, + StringReader &&feature_names, + StringReader &&feature_types, + const std::map> &feature_metadata, + uint32_t buffer_size, + uint32_t chunk_size, + uint32_t gzip_level +); + +std::string get10xMatrixType(std::string file, std::string group); + +} // end namespace BPCells \ No newline at end of file diff --git a/r/src/bpcells-cpp/matrixIterators/ImportMatrixAnnDataHDF5.cpp b/r/src/bpcells-cpp/matrixIterators/ImportMatrixAnnDataHDF5.cpp new file mode 100644 index 00000000..1bca96a8 --- /dev/null +++ b/r/src/bpcells-cpp/matrixIterators/ImportMatrixAnnDataHDF5.cpp @@ -0,0 +1,620 @@ +// Copyright 2022 BPCells contributors +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +#include "ImportMatrixAnnDataHDF5.h" +#include "../arrayIO/array_interfaces.h" +#include "../arrayIO/vector.h" +#include "FilterZeros.h" + +namespace BPCells { + +namespace { +class AnnDataDimInfo { +public: + std::vector dims; + bool row_major; + bool is_sparse; +}; +} + +// Helper class for reading AnnData dense matrix values in transposed order. +// Given a 2D dataset matrix, outputs values row-by-row +template class H5AnnDataDenseValReader : public BulkNumReader { + private: + HighFive::DataSet d_; + HighFive::DataType type_; + const uint64_t rows_; + const uint64_t cols_; + uint64_t cur_row_ = 0, cur_col_ = 0; + + public: + H5AnnDataDenseValReader(HighFive::DataSet &&d) + : d_(std::move(d)) + , type_(d_.getDataType()) + , rows_(d_.getDimensions()[0]) + , cols_(d_.getDimensions()[1]) {} + + // Return total number of integers in the reader + uint64_t size() const override { return rows_ * cols_; } + + // Change the next load to start at index pos + void seek(uint64_t pos) override { + cur_row_ = pos / cols_; + cur_col_ = pos % cols_; + } + + // Copy up to `count` integers into `out`, returning the actual number copied. + // Will always load >0 if count is >0 + // Note: It is the caller's responsibility to ensure there is no data overflow, i.e. + // that a load does not try to read past size() total elements + uint64_t load(T *out, uint64_t count) override { + if (cur_row_ >= rows_) return 0; + count = std::min(count, cols_ - cur_col_); + d_.select({cur_row_, cur_col_}, {1, count}).read_raw(out, type_); + cur_col_ += count; + if (cur_col_ == cols_) { + cur_col_ = 0; + cur_row_ += 1; + } + return count; + } +}; + +// Helper class for reading AnnData dense matrix values in transposed order. +// Given dimensions of a 2D dataset matrix, output numbers [0,cols) rows times. +class H5AnnDataDenseRowReader : public BulkNumReader { + private: + const uint64_t rows_; + const uint64_t cols_; + uint64_t cur_row_ = 0, cur_col_ = 0; + + public: + H5AnnDataDenseRowReader(uint64_t rows, uint64_t cols) : rows_(rows), cols_(cols) {} + // Return total number of integers in the reader + uint64_t size() const override { return rows_ * cols_; } + + // Change the next load to start at index pos + void seek(uint64_t pos) override { + cur_row_ = pos / cols_; + cur_col_ = pos % cols_; + } + + // Copy up to `count` integers into `out`, returning the actual number copied. + // Will always load >0 if count is >0 + // Note: It is the caller's responsibility to ensure there is no data overflow, i.e. + // that a load does not try to read past size() total elements + uint64_t load(uint32_t *out, uint64_t count) override { + if (cur_row_ >= rows_) return 0; + count = std::min(count, cols_ - cur_col_); + for (uint64_t i = 0; i < count; i++) { + out[i] = i + cur_col_; + } + cur_col_ += count; + if (cur_col_ == cols_) { + cur_col_ = 0; + cur_row_ += 1; + } + return count; + } +}; + +// Helper class for reading AnnData dense matrix values in transposed order. +// Given dimensions of a 2D dataset matrix, output numbers 0, cols, 2*cols, ..., rows*cols +class H5AnnDataDenseColPtrReader : public BulkNumReader { + private: + const uint64_t rows_; + const uint64_t cols_; + uint64_t cur_row_ = 0; + + public: + H5AnnDataDenseColPtrReader(uint64_t rows, uint64_t cols) : rows_(rows), cols_(cols) {} + // Return total number of integers in the reader + uint64_t size() const override { return rows_ + 1; } + + // Change the next load to start at index pos + void seek(uint64_t pos) override { cur_row_ = pos; } + + // Copy up to `count` integers into `out`, returning the actual number copied. + // Will always load >0 if count is >0 + // Note: It is the caller's responsibility to ensure there is no data overflow, i.e. + // that a load does not try to read past size() total elements + uint64_t load(uint64_t *out, uint64_t count) override { + if (cur_row_ > rows_) return 0; + count = std::min(count, rows_ + 1 - cur_row_); + for (uint64_t i = 0; i < count; i++) { + out[i] = cols_ * (cur_row_ + i); + } + cur_row_ += count; + return count; + } +}; + +void assertAnnDataSparse(std::string file, std::string group) { + // Check for a dense matrix where we expect a sparse matrix + HighFive::File f = openH5ForReading(file); + auto node_type = f.getObjectType(group); + if (node_type == HighFive::ObjectType::Dataset) { + throw std::runtime_error( + "Error in opening AnnData matrix: \"" + group + + "\" is a dataset rather than a group. This likely indicates a dense matrix which " + "is not yet supported by BPCells." + ); + } +} + +// Read either obs names or var names from AnnData file +// Args: +// root: Root group object of file +// axis: Name of axis to read (either "obs" or "var") +std::unique_ptr readAnnDataDimname(HighFive::Group &root, std::string axis) { + if (root.getObjectType(axis) == HighFive::ObjectType::Dataset) { + // Support for legacy format + std::vector name_data; + readMember(root.getDataSet(axis), "index", name_data); + return std::make_unique(name_data); + } else { + std::string dim_ids; + root.getGroup(axis).getAttribute("_index").read(dim_ids); + dim_ids = axis + "/" + dim_ids; + return std::make_unique(root, dim_ids); + } +} + +// Read length of either obs or var dimension from AnnData file +// Args: +// root: Root group object of file +// axis: Name of axis to read (either "obs" or "var") +size_t readAnnDataDimnameLength(HighFive::Group &root, std::string axis) { + std::vector dims; + if (root.getObjectType(axis) == HighFive::ObjectType::Dataset) { + // Support for legacy format + dims = root.getDataSet(axis).getDimensions(); + } else { + std::string dim_ids; + root.getGroup(axis).getAttribute("_index").read(dim_ids); + dim_ids = axis + "/" + dim_ids; + dims = root.getDataSet(dim_ids).getDimensions(); + } + if (dims.size() != 1) + throw std::runtime_error( + std::string("readAnnDataDimname(): expected ") + axis + + " index to be 1-dimensional array." + ); + return dims[0]; +} + +// Read dims and dimnames for an Anndata matrix +// Args: +// file: File with read access +// group: string path of matrix to read within file +// Name matching is done based on dimension length. If obs and var have same length, name +// mis-assignments may occur. +AnnDataDimInfo readAnnDataDims( + HighFive::File &file, + std::string group +) { + AnnDataDimInfo ret; + + auto node_type = file.getObjectType(group); + if (node_type == HighFive::ObjectType::Dataset) { + HighFive::DataSet d = file.getDataSet(group); + std::vector dataset_dims = d.getDimensions(); + + ret.dims.resize(dataset_dims.size()); + for (size_t i = 0; i < ret.dims.size(); i++) + ret.dims[i] = dataset_dims[i]; + ret.row_major = true; + ret.is_sparse = false; + } else { + HighFive::Group g = file.getGroup(group); + std::string encoding; + if (g.hasAttribute("h5sparse_format")) { + // Support for legacy format + g.getAttribute("h5sparse_format").read(encoding); + g.getAttribute("h5sparse_shape").read(ret.dims); + encoding += "_matrix"; + } else if (g.hasAttribute("encoding-type")) { + g.getAttribute("encoding-type").read(encoding); + g.getAttribute("shape").read(ret.dims); + } else { + throw std::runtime_error( + "readAnnDataDims(): Could not detect matrix encoding-type on group: " + group + ); + } + + if (encoding == "csr_matrix") { + ret.row_major = true; + } else if (encoding == "csc_matrix") { + ret.row_major = false; + } else { + throw std::runtime_error("readAnnDataDims(): Unsupported matrix encoding: " + encoding); + } + ret.is_sparse = true; + } + + if (ret.dims.size() != 2) { + throw std::runtime_error( + "Error in opening AnnData matrix: \"" + group + "\" has " + + std::to_string(ret.dims.size()) + " dimensions rather than the required two." + ); + } + return ret; +} + +// Read AnnData sparse matrix, with an implicit transpose to CSC format for +// any data stored in CSR format. +// Row/col names are handled as follows: +// - If row_names or col_names are provided, they are assumed to already have taken into +// account the internal transpose that will hapen for row-major matrices. i.e. for +// a row-major X matrix, row_names should be same length as `var/_index` +// - If row_names or col_names are not provided, they are optimistically inferred by +// length matching dimensions with the `obs` and `var` index names. This is a compromise +// to better infer `obsm`, `varm`, `obsp` and `varp` matrix names without having to +// inspect sub-objects in a way that might break embedded AnnData e.g. with MuData. +// If `len(obs)` == `len(var)` we just don't infer names for safety. +template +std::unique_ptr> openAnnDataMatrix( + std::string file, + std::string group, + uint32_t buffer_size, + std::unique_ptr &&row_names, + std::unique_ptr &&col_names, + uint32_t read_size +) { + AnnDataDimInfo info; + HighFive::SilenceHDF5 s; + + if (group == "") group = "/"; + + // Read dims and dimnames + { + HighFive::File h5file = openH5ForReading(file); + info = readAnnDataDims(h5file, group); + HighFive::Group root = h5file.getGroup("/"); + size_t obs_len = readAnnDataDimnameLength(root, "obs"); + size_t var_len = readAnnDataDimnameLength(root, "var"); + + if (info.row_major) std::swap(row_names, col_names); + + // When searching for names, match based on length to obs and var dimensions if required + if (row_names.get() == nullptr && obs_len != var_len) { + if (obs_len == info.dims[0]) row_names = readAnnDataDimname(root, "obs"); + else if (var_len == info.dims[0]) row_names = readAnnDataDimname(root, "var"); + else row_names = std::make_unique(std::vector()); + } + + if (col_names.get() == nullptr && obs_len != var_len) { + if (obs_len == info.dims[1]) col_names = readAnnDataDimname(root, "obs"); + else if (var_len == info.dims[1]) col_names = readAnnDataDimname(root, "var"); + else col_names = std::make_unique(std::vector()); + } + + if (info.row_major) std::swap(row_names, col_names); + } + + if (info.is_sparse) { + H5ReaderBuilder rb(file, group, 1024, 1024); + return std::make_unique>( + rb.openUIntReader("indices"), + rb.open("data"), + rb.openULongReader("indptr"), + info.row_major ? info.dims[1] : info.dims[0], + std::move(row_names), + std::move(col_names) + ); + } else { + HighFive::File h5file = openH5ForReading(file); + auto res = std::make_unique>( + NumReader( + std::make_unique(info.dims[0], info.dims[1]), 1024, 1024 + ), + NumReader( + std::make_unique>(h5file.getDataSet(group)), 1024, 1024 + ), + NumReader( + std::make_unique(info.dims[0], info.dims[1]), 1024, 1024 + ), + info.row_major ? info.dims[1] : info.dims[0], + std::move(row_names), + std::move(col_names) + ); + return std::make_unique>(std::move(res)); + } +} + +template +std::unique_ptr> +openAnnDataMatrix(std::string file, std::string group, uint32_t buffer_size, uint32_t read_size) { + return openAnnDataMatrix( + file, + group, + buffer_size, + std::unique_ptr(nullptr), + std::unique_ptr(nullptr), + read_size + ); +} + +// Convenience class for writing shape attribute +template class H5AttributeNumWriter : public BulkNumWriter { + private: + HighFive::Group g; + std::string attribute_name; + std::vector data; + + public: + H5AttributeNumWriter(HighFive::Group g, std::string attribute_name) + : g(g) + , attribute_name(attribute_name) {} + + uint64_t write(T *in, uint64_t count) override { + data.insert(data.end(), in, in + count); + return count; + } + + void finalize() override { g.createAttribute(attribute_name, data); } +}; + +template +StoredMatrixWriter createAnnDataMatrix( + std::string file, + std::string group, + bool row_major, + uint32_t buffer_size, + uint32_t chunk_size, + uint32_t gzip_level +) { + H5WriterBuilder wb(file, group, buffer_size, chunk_size, false, gzip_level); + HighFive::Group &g = wb.getGroup(); + + // See format docs: + // https://anndata.readthedocs.io/en/latest/fileformat-prose.html#sparse-array-specification-v0-1-0 + g.createAttribute("encoding-type", std::string(row_major ? "csr_matrix" : "csc_matrix")); + g.createAttribute("encoding-version", std::string("0.1.0")); + + return StoredMatrixWriter( + wb.createUIntWriter("indices"), + wb.create("data"), + wb.createLongWriter("indptr").convert(), + UIntWriter(std::make_unique>(g, "shape"), 16), + std::make_unique(), + std::make_unique(), + std::make_unique(), + row_major + ); +} + +// Add obs + var metadata to an anndata file if needed. +// Draw from the matrix row/col names if present, or otherwise +// insert dummy identifiers. +template +void createAnnDataObsVarIfMissing( + MatrixLoader &mat, std::string file, bool row_major, uint32_t gzip_level +) { + HighFive::SilenceHDF5 s; + H5WriterBuilder wb(file, "/", 8192, 1024, true, gzip_level); + HighFive::Group &g = wb.getGroup(); + + const std::string index_name = "bpcells_name"; + + std::string row_key = "obs"; + std::string col_key = "var"; + if (row_major) std::swap(row_key, col_key); + + if (!g.exist(row_key)) { + HighFive::Group metadata = g.createGroup(row_key); + metadata.createAttribute("encoding-type", std::string("dataframe")); + metadata.createAttribute("encoding-version", std::string("0.2.0")); + metadata.createAttribute("_index", index_name); + metadata.createAttribute("column-order", std::vector{index_name}); + + std::vector row_names(mat.rows()); + if (mat.rows() > 0 && mat.rowNames(0) != NULL) { + for (uint32_t i = 0; i < mat.rows(); i++) { + row_names[i] = mat.rowNames(i); + } + } else { + for (uint32_t i = 0; i < mat.rows(); i++) { + row_names[i] = std::to_string(i); + } + } + + wb.createStringWriter(row_key + "/" + index_name)->write(VecStringReader(row_names)); + } + if (!g.exist(col_key)) { + HighFive::Group metadata = g.createGroup(col_key); + metadata.createAttribute("encoding-type", std::string("dataframe")); + metadata.createAttribute("encoding-version", std::string("0.2.0")); + metadata.createAttribute("_index", index_name); + metadata.createAttribute("column-order", std::vector{index_name}); + + std::vector col_names(mat.cols()); + if (mat.cols() > 0 && mat.colNames(0) != NULL) { + for (uint32_t i = 0; i < mat.cols(); i++) { + col_names[i] = mat.colNames(i); + } + } else { + for (uint32_t i = 0; i < mat.cols(); i++) { + col_names[i] = std::to_string(i); + } + } + + wb.createStringWriter(col_key + "/" + index_name)->write(VecStringReader(col_names)); + } +} + +// Explicit template instantiations +template std::unique_ptr> openAnnDataMatrix( + std::string file, + std::string group, + uint32_t buffer_size, + std::unique_ptr &&row_names, + std::unique_ptr &&col_names, + uint32_t read_size +); +template std::unique_ptr> openAnnDataMatrix( + std::string file, + std::string group, + uint32_t buffer_size, + std::unique_ptr &&row_names, + std::unique_ptr &&col_names, + uint32_t read_size +); +template std::unique_ptr> openAnnDataMatrix( + std::string file, + std::string group, + uint32_t buffer_size, + std::unique_ptr &&row_names, + std::unique_ptr &&col_names, + uint32_t read_size +); +template std::unique_ptr> openAnnDataMatrix( + std::string file, + std::string group, + uint32_t buffer_size, + std::unique_ptr &&row_names, + std::unique_ptr &&col_names, + uint32_t read_size +); + +template std::unique_ptr> openAnnDataMatrix( + std::string file, std::string group, uint32_t buffer_size, uint32_t read_size +); +template std::unique_ptr> openAnnDataMatrix( + std::string file, std::string group, uint32_t buffer_size, uint32_t read_size +); +template std::unique_ptr> openAnnDataMatrix( + std::string file, std::string group, uint32_t buffer_size, uint32_t read_size +); +template std::unique_ptr> openAnnDataMatrix( + std::string file, std::string group, uint32_t buffer_size, uint32_t read_size +); + +template StoredMatrixWriter createAnnDataMatrix( + std::string file, + std::string group, + bool row_major, + uint32_t buffer_size, + uint32_t chunk_size, + uint32_t gzip_level +); +template StoredMatrixWriter createAnnDataMatrix( + std::string file, + std::string group, + bool row_major, + uint32_t buffer_size, + uint32_t chunk_size, + uint32_t gzip_level +); +template StoredMatrixWriter createAnnDataMatrix( + std::string file, + std::string group, + bool row_major, + uint32_t buffer_size, + uint32_t chunk_size, + uint32_t gzip_level +); +template StoredMatrixWriter createAnnDataMatrix( + std::string file, + std::string group, + bool row_major, + uint32_t buffer_size, + uint32_t chunk_size, + uint32_t gzip_level +); + +template void createAnnDataObsVarIfMissing( + MatrixLoader &mat, std::string file, bool row_major, uint32_t gzip_level +); +template void createAnnDataObsVarIfMissing( + MatrixLoader &mat, std::string file, bool row_major, uint32_t gzip_level +); +template void createAnnDataObsVarIfMissing( + MatrixLoader &mat, std::string file, bool row_major, uint32_t gzip_level +); +template void createAnnDataObsVarIfMissing( + MatrixLoader &mat, std::string file, bool row_major, uint32_t gzip_level +); +// End explicit template instantiations + +std::string getAnnDataMatrixType(std::string file, std::string group) { + HighFive::SilenceHDF5 s; + HighFive::File h5file = openH5ForReading(file); + if (group == "") group = "/"; + + HighFive::DataType t; + if (h5file.getObjectType(group) == HighFive::ObjectType::Group) { + t = h5file.getGroup(group).getDataSet("data").getDataType(); + } else { + t = h5file.getDataSet(group).getDataType(); + } + + if (t == HighFive::AtomicType()) { + return "uint32_t"; + } else if (t == HighFive::AtomicType()) { + return "uint32_t"; + } else if (t == HighFive::AtomicType()) { + return "uint64_t"; + } else if (t == HighFive::AtomicType()) { + return "float"; + } else if (t == HighFive::AtomicType()) { + return "double"; + } + throw std::runtime_error( + "getAnnDataMatrixType: unrecognized type for group " + group + " in file " + file + ); +} + +bool isRowOrientedAnnDataMatrix(std::string file, std::string group) { + HighFive::SilenceHDF5 s; + HighFive::File h5file = openH5ForReading(file); + return readAnnDataDims(h5file, group).row_major; +} + +template +void readMember(HighFive::DataSet &&dataset, std::string name, std::vector &out) { + auto dims = dataset.getDimensions(); + if (dims.size() != 1) { + std::ostringstream ss; + ss << "readMember: dims must be 1, found " << dims.size(); + throw std::runtime_error(ss.str()); + } + HighFive::CompoundType base_type = dataset.getDataType(); + HighFive::DataType field_type = HighFive::create_datatype(); + bool found = false; + // Error checking that the type exists + for (auto &t : base_type.getMembers()) { + if (t.name == name) { + if (t.base_type.getClass() == field_type.getClass()) { + found = true; + break; + } + std::ostringstream ss; + ss << "Type of member \"" << name << "\" in file (" << t.base_type.string() + << ") does not match class of requested type (" << field_type.string() << ")"; + throw HighFive::DataTypeException(ss.str()); + } + } + if (!found) { + std::ostringstream ss; + ss << "Member \"" << name << "\" not found in compound data type"; + throw HighFive::DataTypeException(ss.str()); + } + // Since HDF5 is pretty good about compatibility, we can ask for a "type" that has just the one + // field we care about + HighFive::CompoundType subtype({{name.c_str(), field_type}}); + + out.resize(dims[0]); + // Use a data-converter like the fancy read function (makes string conversion etc. work) + auto r = HighFive::details::data_converter::get_reader>(dims, out, field_type); + dataset.read_raw(r.getPointer(), subtype); + // re-arrange results + r.unserialize(out); +} + +} // end namespace BPCells diff --git a/r/src/bpcells-cpp/matrixIterators/ImportMatrixHDF5.h b/r/src/bpcells-cpp/matrixIterators/ImportMatrixAnnDataHDF5.h similarity index 52% rename from r/src/bpcells-cpp/matrixIterators/ImportMatrixHDF5.h rename to r/src/bpcells-cpp/matrixIterators/ImportMatrixAnnDataHDF5.h index 39855fba..e1a305fc 100644 --- a/r/src/bpcells-cpp/matrixIterators/ImportMatrixHDF5.h +++ b/r/src/bpcells-cpp/matrixIterators/ImportMatrixAnnDataHDF5.h @@ -7,70 +7,32 @@ // except according to those terms. #pragma once -#include "../arrayIO/array_interfaces.h" #include "../arrayIO/hdf5.h" -#include "../arrayIO/vector.h" #include "StoredMatrix.h" #include "StoredMatrixWriter.h" namespace BPCells { -// Helper class for when we need to fake reading a single integer from memory -template class SingletonNumReader : public BulkNumReader { - private: - T num; - bool read = false; - - public: - SingletonNumReader(T num) : num(num) {} - uint64_t size() const override { return 1; } - void seek(uint64_t pos) override { read = pos > 0; } - uint64_t load(T *out, uint64_t count) override { - if (read) return 0; - out[0] = num; - return 1; - } -}; - -// Reader interfaces for 10x and AnnData matrices -// Open 10x matrix should not assume uint32_t -template -StoredMatrix open10xFeatureMatrix( - std::string file, std::string group, uint32_t buffer_size, uint32_t read_size = 1024 -); - -template -StoredMatrix open10xFeatureMatrix( - std::string file, - std::string group, - uint32_t buffer_size, - std::unique_ptr &&row_names, - std::unique_ptr &&col_names, - uint32_t read_size = 1024 -); - -template -StoredMatrixWriter create10xFeatureMatrix( - std::string file, - StringReader &&barcodes, - StringReader &&feature_ids, - StringReader &&feature_names, - StringReader &&feature_types, - const std::map> &feature_metadata, - uint32_t buffer_size, - uint32_t chunk_size, - uint32_t gzip_level -); - // Read AnnData sparse matrix, with an implicit transpose to CSC format for // any data stored in CSR format template -StoredMatrix openAnnDataMatrix( +std::unique_ptr> openAnnDataMatrix( std::string file, std::string group, uint32_t buffer_size, uint32_t read_size = 1024 ); +// Read AnnData sparse matrix, with an implicit transpose to CSC format for +// any data stored in CSR format. +// Row/col names are handled as follows: +// - If row_names or col_names are provided, they are assumed to already have taken into +// account the internal transpose that will hapen for row-major matrices. i.e. for +// a row-major X matrix, row_names should be same length as `var/_index` +// - If row_names or col_names are not provided, they are optimistically inferred by +// length matching dimensions with the `obs` and `var` index names. This is a compromise +// to better infer `obsm`, `varm`, `obsp` and `varp` matrix names without having to +// inspect sub-objects in a way that might break embedded AnnData e.g. with MuData. +// If `len(obs)` == `len(var)` we just don't infer names for safety. template -StoredMatrix openAnnDataMatrix( +std::unique_ptr> openAnnDataMatrix( std::string file, std::string group, uint32_t buffer_size, @@ -99,7 +61,6 @@ void createAnnDataObsVarIfMissing( ); std::string getAnnDataMatrixType(std::string file, std::string group); -std::string get10xMatrixType(std::string file, std::string group); bool isRowOrientedAnnDataMatrix(std::string file, std::string group); diff --git a/r/src/bpcells-cpp/matrixIterators/ImportMatrixHDF5.cpp b/r/src/bpcells-cpp/matrixIterators/ImportMatrixHDF5.cpp deleted file mode 100644 index a5525df8..00000000 --- a/r/src/bpcells-cpp/matrixIterators/ImportMatrixHDF5.cpp +++ /dev/null @@ -1,629 +0,0 @@ -// Copyright 2022 BPCells contributors -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -#include "ImportMatrixHDF5.h" - -namespace BPCells { - -// Reader interfaces for 10x and AnnData matrices -template -StoredMatrix open10xFeatureMatrix( - std::string file, - std::string group, - uint32_t buffer_size, - std::unique_ptr &&row_names, - std::unique_ptr &&col_names, - uint32_t read_size -) { - H5ReaderBuilder rb(file, group, buffer_size, read_size); - if (rb.getGroup().exist("features/id")) { - // Most up-to-date matrix format (cellranger V3+) - uint32_t rows = rb.openUIntReader("shape").read_one(); - if (row_names.get() == nullptr) { - row_names = rb.openStringReader("features/id"); - } - if (col_names.get() == nullptr) { - col_names = rb.openStringReader("barcodes"); - } - return StoredMatrix( - rb.openULongReader("indices").convert(), - rb.open("data"), - rb.openULongReader("indptr"), - rows, - std::move(row_names), - std::move(col_names) - ); - } - - // Older-style 10x matrix format - uint32_t rows = rb.openUIntReader("shape").read_one(); - if (row_names.get() == nullptr) { - row_names = rb.openStringReader("genes"); - } - if (col_names.get() == nullptr) { - col_names = rb.openStringReader("barcodes"); - } - return StoredMatrix( - rb.openULongReader("indices").convert(), - rb.open("data"), - rb.openULongReader("indptr"), - rows, - std::move(row_names), - std::move(col_names) - ); -} - -template -StoredMatrix open10xFeatureMatrix( - std::string file, std::string group, uint32_t buffer_size, uint32_t read_size -) { - return open10xFeatureMatrix( - file, - group, - buffer_size, - std::unique_ptr(nullptr), - std::unique_ptr(nullptr), - read_size - ); -} - -template -StoredMatrixWriter create10xFeatureMatrix( - std::string file_path, - StringReader &&barcodes, - StringReader &&feature_ids, - StringReader &&feature_names, - StringReader &&feature_types, - const std::map> &feature_metadata, - uint32_t buffer_size, - uint32_t chunk_size, - uint32_t gzip_level -) { - H5WriterBuilder wb(file_path, "matrix", buffer_size, chunk_size, false, gzip_level); - - wb.createStringWriter("barcodes")->write(barcodes); - wb.createStringWriter("features/id")->write(feature_ids); - wb.createStringWriter("features/name")->write(feature_names); - wb.createStringWriter("features/feature_type")->write(feature_types); - std::vector tag_keys; - for (const auto &[key, value] : feature_metadata) { - wb.createStringWriter(std::string("features/") + key)->write(*value); - tag_keys.push_back(key); - } - wb.createStringWriter("features/_all_tag_keys")->write(VecStringReader(tag_keys)); - - return StoredMatrixWriter( - wb.createULongWriter("indices").convert(), - wb.create("data"), - wb.createULongWriter("indptr"), - wb.createUIntWriter("shape"), - std::make_unique(), - std::make_unique(), - std::make_unique() - ); -} - -// Explicit template instantiations for 10x matrix -template StoredMatrix open10xFeatureMatrix( - std::string file, - std::string group, - uint32_t buffer_size, - std::unique_ptr &&row_names, - std::unique_ptr &&col_names, - uint32_t read_size -); -template StoredMatrix open10xFeatureMatrix( - std::string file, - std::string group, - uint32_t buffer_size, - std::unique_ptr &&row_names, - std::unique_ptr &&col_names, - uint32_t read_size -); -template StoredMatrix open10xFeatureMatrix( - std::string file, - std::string group, - uint32_t buffer_size, - std::unique_ptr &&row_names, - std::unique_ptr &&col_names, - uint32_t read_size -); -template StoredMatrix open10xFeatureMatrix( - std::string file, - std::string group, - uint32_t buffer_size, - std::unique_ptr &&row_names, - std::unique_ptr &&col_names, - uint32_t read_size -); - -template StoredMatrix open10xFeatureMatrix( - std::string file, std::string group, uint32_t buffer_size, uint32_t read_size -); -template StoredMatrix open10xFeatureMatrix( - std::string file, std::string group, uint32_t buffer_size, uint32_t read_size -); -template StoredMatrix open10xFeatureMatrix( - std::string file, std::string group, uint32_t buffer_size, uint32_t read_size -); -template StoredMatrix open10xFeatureMatrix( - std::string file, std::string group, uint32_t buffer_size, uint32_t read_size -); - -template StoredMatrixWriter create10xFeatureMatrix( - std::string file_path, - StringReader &&barcodes, - StringReader &&feature_ids, - StringReader &&feature_names, - StringReader &&feature_types, - const std::map> &feature_metadata, - uint32_t buffer_size, - uint32_t chunk_size, - uint32_t gzip_level -); -template StoredMatrixWriter create10xFeatureMatrix( - std::string file_path, - StringReader &&barcodes, - StringReader &&feature_ids, - StringReader &&feature_names, - StringReader &&feature_types, - const std::map> &feature_metadata, - uint32_t buffer_size, - uint32_t chunk_size, - uint32_t gzip_level -); -template StoredMatrixWriter create10xFeatureMatrix( - std::string file_path, - StringReader &&barcodes, - StringReader &&feature_ids, - StringReader &&feature_names, - StringReader &&feature_types, - const std::map> &feature_metadata, - uint32_t buffer_size, - uint32_t chunk_size, - uint32_t gzip_level -); -template StoredMatrixWriter create10xFeatureMatrix( - std::string file_path, - StringReader &&barcodes, - StringReader &&feature_ids, - StringReader &&feature_names, - StringReader &&feature_types, - const std::map> &feature_metadata, - uint32_t buffer_size, - uint32_t chunk_size, - uint32_t gzip_level -); -// End explicit template instantiations for 10x matrix - -void assertAnnDataSparse(std::string file, std::string group) { - // Check for a dense matrix where we expect a sparse matrix - HighFive::File f = openH5ForReading(file); - auto node_type = f.getObjectType(group); - if (node_type == HighFive::ObjectType::Dataset) { - throw std::runtime_error( - "Error in opening AnnData matrix: \"" + group + - "\" is a dataset rather than a group. This likely indicates a dense matrix which " - "is not yet supported by BPCells." - ); - } -} -// Read AnnData sparse matrix, with an implicit transpose to CSC format for -// any data stored in CSR format -template -StoredMatrix openAnnDataMatrix( - std::string file, - std::string group, - uint32_t buffer_size, - std::unique_ptr &&row_names, - std::unique_ptr &&col_names, - uint32_t read_size -) { - assertAnnDataSparse(file, group); - H5ReaderBuilder rb(file, group, buffer_size, read_size); - - HighFive::SilenceHDF5 s; - HighFive::Group &g = rb.getGroup(); - std::string encoding; - std::vector dims; - - HighFive::Group root = g.getFile().getGroup("/"); - - if (g.hasAttribute("h5sparse_format")) { - // Support for legacy format - g.getAttribute("h5sparse_format").read(encoding); - g.getAttribute("h5sparse_shape").read(dims); - encoding += "_matrix"; - // Make sure we assign to the correct row/col names if some are already provided - if (encoding == "csr_matrix") std::swap(row_names, col_names); - - auto dt = root.getDataSet("var").getDataType(); - - if (row_names.get() == nullptr) { - std::vector row_name_data; - readMember(root.getDataSet("obs"), "index", row_name_data); - // Handle matrices that aren't dims (obs x var) - if (row_name_data.size() != dims[0]) row_name_data.clear(); - row_names = std::make_unique(row_name_data); - } - if (col_names.get() == nullptr) { - std::vector col_name_data; - readMember(root.getDataSet("var"), "index", col_name_data); - // Handle matrices that aren't dims (obs x var) - if (col_name_data.size() != dims[1]) col_name_data.clear(); - col_names = std::make_unique(col_name_data); - } - } else if (g.hasAttribute("encoding-type")) { - g.getAttribute("encoding-type").read(encoding); - g.getAttribute("shape").read(dims); - // Make sure we assign to the correct row/col names if some are already provided - if (encoding == "csr_matrix") std::swap(row_names, col_names); - if (row_names.get() == nullptr) { - std::string row_ids; - root.getGroup("obs").getAttribute("_index").read(row_ids); - row_ids = "obs/" + row_ids; - row_names = std::make_unique(root, row_ids); - // Handle matrices that aren't dims (obs x var) - if (row_names->size() != dims[0]) - row_names = std::make_unique(std::vector()); - } - if (col_names.get() == nullptr) { - std::string col_ids; - root.getGroup("var").getAttribute("_index").read(col_ids); - col_ids = "var/" + col_ids; - col_names = std::make_unique(root, col_ids); - // Handle matrices that aren't dims (obs x var) - if (col_names->size() != dims[1]) - col_names = std::make_unique(std::vector()); - } - } else { - throw std::runtime_error( - "h5ad could not be read - missing attribute 'encoding-type' on sparse matrix" - ); - } - - uint32_t rows; - if (encoding == "csr_matrix") { - rows = dims[1]; - std::swap(row_names, col_names); - } else if (encoding == "csc_matrix") { - rows = dims[0]; - } else throw std::runtime_error("Unsupported matrix encoding: " + encoding); - - return StoredMatrix( - rb.openUIntReader("indices"), - rb.open("data"), - rb.openULongReader("indptr"), - rows, - std::move(row_names), - std::move(col_names) - ); -} - -template -StoredMatrix -openAnnDataMatrix(std::string file, std::string group, uint32_t buffer_size, uint32_t read_size) { - return openAnnDataMatrix( - file, - group, - buffer_size, - std::unique_ptr(nullptr), - std::unique_ptr(nullptr), - read_size - ); -} - -// Convenience class forr writing shape attribute -template class H5AttributeNumWriter : public BulkNumWriter { - private: - HighFive::Group g; - std::string attribute_name; - std::vector data; - - public: - H5AttributeNumWriter(HighFive::Group g, std::string attribute_name) - : g(g) - , attribute_name(attribute_name) {} - - uint64_t write(T *in, uint64_t count) override { - data.insert(data.end(), in, in + count); - return count; - } - - void finalize() override { g.createAttribute(attribute_name, data); } -}; - -template -StoredMatrixWriter createAnnDataMatrix( - std::string file, - std::string group, - bool row_major, - uint32_t buffer_size, - uint32_t chunk_size, - uint32_t gzip_level -) { - H5WriterBuilder wb(file, group, buffer_size, chunk_size, false, gzip_level); - HighFive::Group &g = wb.getGroup(); - - // See format docs: - // https://anndata.readthedocs.io/en/latest/fileformat-prose.html#sparse-array-specification-v0-1-0 - g.createAttribute("encoding-type", std::string(row_major ? "csr_matrix" : "csc_matrix")); - g.createAttribute("encoding-version", std::string("0.1.0")); - - return StoredMatrixWriter( - wb.createUIntWriter("indices"), - wb.create("data"), - wb.createLongWriter("indptr").convert(), - UIntWriter(std::make_unique>(g, "shape"), 16), - std::make_unique(), - std::make_unique(), - std::make_unique(), - row_major - ); -} - -// Add obs + var metadata to an anndata file if needed. -// Draw from the matrix row/col names if present, or otherwise -// insert dummy identifiers. -template -void createAnnDataObsVarIfMissing( - MatrixLoader &mat, std::string file, bool row_major, uint32_t gzip_level -) { - HighFive::SilenceHDF5 s; - H5WriterBuilder wb(file, "/", 8192, 1024, true, gzip_level); - HighFive::Group &g = wb.getGroup(); - - const std::string index_name = "bpcells_name"; - - std::string row_key = "obs"; - std::string col_key = "var"; - if (row_major) std::swap(row_key, col_key); - - if (!g.exist(row_key)) { - HighFive::Group metadata = g.createGroup(row_key); - metadata.createAttribute("encoding-type", std::string("dataframe")); - metadata.createAttribute("encoding-version", std::string("0.2.0")); - metadata.createAttribute("_index", index_name); - metadata.createAttribute("column-order", std::vector{index_name}); - - std::vector row_names(mat.rows()); - if (mat.rows() > 0 && mat.rowNames(0) != NULL) { - for (uint32_t i = 0; i < mat.rows(); i++) { - row_names[i] = mat.rowNames(i); - } - } else { - for (uint32_t i = 0; i < mat.rows(); i++) { - row_names[i] = std::to_string(i); - } - } - - wb.createStringWriter(row_key + "/" + index_name)->write(VecStringReader(row_names)); - } - if (!g.exist(col_key)) { - HighFive::Group metadata = g.createGroup(col_key); - metadata.createAttribute("encoding-type", std::string("dataframe")); - metadata.createAttribute("encoding-version", std::string("0.2.0")); - metadata.createAttribute("_index", index_name); - metadata.createAttribute("column-order", std::vector{index_name}); - - std::vector col_names(mat.cols()); - if (mat.cols() > 0 && mat.colNames(0) != NULL) { - for (uint32_t i = 0; i < mat.cols(); i++) { - col_names[i] = mat.colNames(i); - } - } else { - for (uint32_t i = 0; i < mat.cols(); i++) { - col_names[i] = std::to_string(i); - } - } - - wb.createStringWriter(col_key + "/" + index_name)->write(VecStringReader(col_names)); - } -} - -// Explicit template instantiations -template StoredMatrix openAnnDataMatrix( - std::string file, - std::string group, - uint32_t buffer_size, - std::unique_ptr &&row_names, - std::unique_ptr &&col_names, - uint32_t read_size -); -template StoredMatrix openAnnDataMatrix( - std::string file, - std::string group, - uint32_t buffer_size, - std::unique_ptr &&row_names, - std::unique_ptr &&col_names, - uint32_t read_size -); -template StoredMatrix openAnnDataMatrix( - std::string file, - std::string group, - uint32_t buffer_size, - std::unique_ptr &&row_names, - std::unique_ptr &&col_names, - uint32_t read_size -); -template StoredMatrix openAnnDataMatrix( - std::string file, - std::string group, - uint32_t buffer_size, - std::unique_ptr &&row_names, - std::unique_ptr &&col_names, - uint32_t read_size -); - -template StoredMatrix openAnnDataMatrix( - std::string file, std::string group, uint32_t buffer_size, uint32_t read_size -); -template StoredMatrix openAnnDataMatrix( - std::string file, std::string group, uint32_t buffer_size, uint32_t read_size -); -template StoredMatrix openAnnDataMatrix( - std::string file, std::string group, uint32_t buffer_size, uint32_t read_size -); -template StoredMatrix openAnnDataMatrix( - std::string file, std::string group, uint32_t buffer_size, uint32_t read_size -); - -template StoredMatrixWriter createAnnDataMatrix( - std::string file, - std::string group, - bool row_major, - uint32_t buffer_size, - uint32_t chunk_size, - uint32_t gzip_level -); -template StoredMatrixWriter createAnnDataMatrix( - std::string file, - std::string group, - bool row_major, - uint32_t buffer_size, - uint32_t chunk_size, - uint32_t gzip_level -); -template StoredMatrixWriter createAnnDataMatrix( - std::string file, - std::string group, - bool row_major, - uint32_t buffer_size, - uint32_t chunk_size, - uint32_t gzip_level -); -template StoredMatrixWriter createAnnDataMatrix( - std::string file, - std::string group, - bool row_major, - uint32_t buffer_size, - uint32_t chunk_size, - uint32_t gzip_level -); - -template void createAnnDataObsVarIfMissing( - MatrixLoader &mat, std::string file, bool row_major, uint32_t gzip_level -); -template void createAnnDataObsVarIfMissing( - MatrixLoader &mat, std::string file, bool row_major, uint32_t gzip_level -); -template void createAnnDataObsVarIfMissing( - MatrixLoader &mat, std::string file, bool row_major, uint32_t gzip_level -); -template void createAnnDataObsVarIfMissing( - MatrixLoader &mat, std::string file, bool row_major, uint32_t gzip_level -); -// End explicit template instantiations - -std::string getAnnDataMatrixType(std::string file, std::string group) { - assertAnnDataSparse(file, group); - H5ReaderBuilder rb(file, group, 1024, 1024); - HighFive::SilenceHDF5 s; - HighFive::DataType t = rb.getGroup().getDataSet("data").getDataType(); - if (t == HighFive::AtomicType()) { - return "uint32_t"; - } else if (t == HighFive::AtomicType()) { - return "uint32_t"; - } else if (t == HighFive::AtomicType()) { - return "uint64_t"; - } else if (t == HighFive::AtomicType()) { - return "float"; - } else if (t == HighFive::AtomicType()) { - return "double"; - } - throw std::runtime_error( - "getAnnDataMatrixType: unrecognized type for group " + group + " in file " + file - ); -} - -std::string get10xMatrixType(std::string file, std::string group) { - H5ReaderBuilder rb(file, group, 1024, 1024); - HighFive::SilenceHDF5 s; - HighFive::DataType t = rb.getGroup().getDataSet("data").getDataType(); - if (t == HighFive::AtomicType()) { - return "uint32_t"; - } else if (t == HighFive::AtomicType()) { - return "uint32_t"; - } else if (t == HighFive::AtomicType()) { - return "uint64_t"; - } else if (t == HighFive::AtomicType()) { - return "float"; - } else if (t == HighFive::AtomicType()) { - return "double"; - } - throw std::runtime_error( - "get10xMatrixType: unrecognized type for group " + group + " in file " + file - ); -} - -bool isRowOrientedAnnDataMatrix(std::string file, std::string group) { - assertAnnDataSparse(file, group); - H5ReaderBuilder rb(file, group, 1024, 1024); - - HighFive::SilenceHDF5 s; - HighFive::Group &g = rb.getGroup(); - std::string encoding; - - if (g.hasAttribute("h5sparse_format")) { - // Support for legacy format - g.getAttribute("h5sparse_format").read(encoding); - encoding += "_matrix"; - } else if (g.hasAttribute("encoding-type")) { - g.getAttribute("encoding-type").read(encoding); - } else { - throw std::runtime_error( - "h5ad could not be read - missing attribute 'encoding-type' on sparse matrix" - ); - } - - if (encoding == "csr_matrix") return true; - else if (encoding == "csc_matrix") return false; - else throw std::runtime_error("Unsupported matrix encoding: " + encoding); -} - -template -void readMember(HighFive::DataSet &&dataset, std::string name, std::vector &out) { - auto dims = dataset.getDimensions(); - if (dims.size() != 1) { - std::ostringstream ss; - ss << "readMember: dims must be 1, found " << dims.size(); - throw std::runtime_error(ss.str()); - } - HighFive::CompoundType base_type = dataset.getDataType(); - HighFive::DataType field_type = HighFive::create_datatype(); - bool found = false; - // Error checking that the type exists - for (auto &t : base_type.getMembers()) { - if (t.name == name) { - if (t.base_type.getClass() == field_type.getClass()) { - found = true; - break; - } - std::ostringstream ss; - ss << "Type of member \"" << name << "\" in file (" << t.base_type.string() - << ") does not match class of requested type (" << field_type.string() << ")"; - throw HighFive::DataTypeException(ss.str()); - } - } - if (!found) { - std::ostringstream ss; - ss << "Member \"" << name << "\" not found in compound data type"; - throw HighFive::DataTypeException(ss.str()); - } - // Since HDF5 is pretty good about compatibility, we can ask for a "type" that has just the one field we care about - HighFive::CompoundType subtype({{name.c_str(), field_type}}); - - out.resize(dims[0]); - // Use a data-converter like the fancy read function (makes string conversion etc. work) - auto r = HighFive::details::data_converter::get_reader>(dims, out, field_type); - dataset.read_raw(r.getPointer(), subtype); - // re-arrange results - r.unserialize(out); -} - -} // end namespace BPCells diff --git a/r/src/matrix_io.cpp b/r/src/matrix_io.cpp index 3e8ca8af..6a0c3b8d 100644 --- a/r/src/matrix_io.cpp +++ b/r/src/matrix_io.cpp @@ -10,7 +10,8 @@ #define RCPP_NO_SUGAR #include -#include "bpcells-cpp/matrixIterators/ImportMatrixHDF5.h" +#include "bpcells-cpp/matrixIterators/ImportMatrix10xHDF5.h" +#include "bpcells-cpp/matrixIterators/ImportMatrixAnnDataHDF5.h" #include "bpcells-cpp/matrixIterators/MatrixIterator.h" #include "bpcells-cpp/matrixIterators/MatrixMarketImport.h" #include "bpcells-cpp/matrixIterators/StoredMatrix.h" @@ -29,7 +30,7 @@ using namespace Rcpp; using namespace BPCells; -template List dims_matrix(StoredMatrix &&mat, bool transpose) { +template List dims_matrix(MatrixLoader &mat, bool transpose) { IntegerVector dims(2); uint32_t row_name_count = mat.rows() == 0 || mat.rowNames(0) == NULL ? 0 : mat.rows(); @@ -69,6 +70,14 @@ template List dims_matrix(StoredMatrix &&mat, bool transpose) { } } +// This is a convenience overload to allow calling `dims_matrix` with rvalues. +// It looks weird, but I think it is safe. Long-term, a const l-value reference +// is preferable. +// For some additional info, see: https://www.fluentcpp.com/2018/02/06/understanding-lvalues-rvalues-and-their-references/ +template List dims_matrix(MatrixLoader &&mat, bool transpose) { + return dims_matrix(mat, transpose); +} + List dims_matrix_reader_builder(ReaderBuilder &rb) { std::string version = rb.readVersion(); auto storage_order_reader = rb.openStringReader("storage_order"); @@ -645,13 +654,13 @@ List dims_matrix_10x_hdf5_cpp(std::string file, std::string group, uint32_t buff std::string type = get10xMatrixType(file, group); List l; if (type == "uint32_t") { - l = dims_matrix(open10xFeatureMatrix(file, group, buffer_size), false); + l = dims_matrix(*open10xFeatureMatrix(file, group, buffer_size), false); } else if (type == "uint64_t") { - l = dims_matrix(open10xFeatureMatrix(file, group, buffer_size), false); + l = dims_matrix(*open10xFeatureMatrix(file, group, buffer_size), false); } else if (type == "float") { - l = dims_matrix(open10xFeatureMatrix(file, group, buffer_size), false); + l = dims_matrix(*open10xFeatureMatrix(file, group, buffer_size), false); } else if (type == "double") { - l = dims_matrix(open10xFeatureMatrix(file, group, buffer_size), false); + l = dims_matrix(*open10xFeatureMatrix(file, group, buffer_size), false); } else { throw std::runtime_error("dims_matrix_10x_hdf5_cpp: Unrecognized matrix type " + type); } @@ -669,7 +678,7 @@ SEXP iterate_matrix_10x_hdf5_cpp( ) { std::string type = get10xMatrixType(file, group); if (type == "uint32_t") { - return make_unique_xptr>(open10xFeatureMatrix( + return unique_xptr>(open10xFeatureMatrix( file, group, buffer_size, @@ -677,7 +686,7 @@ SEXP iterate_matrix_10x_hdf5_cpp( std::make_unique(col_names) )); } else if (type == "uint64_t") { - return make_unique_xptr>(open10xFeatureMatrix( + return unique_xptr>(open10xFeatureMatrix( file, group, buffer_size, @@ -685,7 +694,7 @@ SEXP iterate_matrix_10x_hdf5_cpp( std::make_unique(col_names) )); } else if (type == "float") { - return make_unique_xptr>(open10xFeatureMatrix( + return unique_xptr>(open10xFeatureMatrix( file, group, buffer_size, @@ -693,7 +702,7 @@ SEXP iterate_matrix_10x_hdf5_cpp( std::make_unique(col_names) )); } else if (type == "double") { - return make_unique_xptr>(open10xFeatureMatrix( + return unique_xptr>(open10xFeatureMatrix( file, group, buffer_size, @@ -819,11 +828,11 @@ List dims_matrix_anndata_hdf5_cpp(std::string file, std::string group, uint32_t std::string type = getAnnDataMatrixType(file, group); List l; if (type == "uint32_t") { - l = dims_matrix(openAnnDataMatrix(file, group, buffer_size), transpose); + l = dims_matrix(*openAnnDataMatrix(file, group, buffer_size), transpose); } else if (type == "float") { - l = dims_matrix(openAnnDataMatrix(file, group, buffer_size), transpose); + l = dims_matrix(*openAnnDataMatrix(file, group, buffer_size), transpose); } else if (type == "double") { - l = dims_matrix(openAnnDataMatrix(file, group, buffer_size), transpose); + l = dims_matrix(*openAnnDataMatrix(file, group, buffer_size), transpose); } else { throw std::runtime_error("dims_matrix_anndata_hdf5_cpp: Unrecognized matrix type " + type); } @@ -848,7 +857,7 @@ SEXP iterate_matrix_anndata_hdf5_cpp( throw std::runtime_error(ss.str()); } if (type == "uint32_t") { - return make_unique_xptr>(openAnnDataMatrix( + return unique_xptr>(openAnnDataMatrix( file, group, buffer_size, @@ -856,7 +865,7 @@ SEXP iterate_matrix_anndata_hdf5_cpp( std::make_unique(col_names) )); } else if (type == "float") { - return make_unique_xptr>(openAnnDataMatrix( + return unique_xptr>(openAnnDataMatrix( file, group, buffer_size, @@ -864,7 +873,7 @@ SEXP iterate_matrix_anndata_hdf5_cpp( std::make_unique(col_names) )); } else if (type == "double") { - return make_unique_xptr>(openAnnDataMatrix( + return unique_xptr>(openAnnDataMatrix( file, group, buffer_size, diff --git a/r/tests/data/generate_old_anndata.py b/r/tests/data/generate_old_anndata.py index dd4b178d..2f3259c0 100644 --- a/r/tests/data/generate_old_anndata.py +++ b/r/tests/data/generate_old_anndata.py @@ -1,7 +1,8 @@ # The anndata version we want is from July 2019, which is when Python 3.7 was new +# AnnData versions 0.6.22 and 0.7.0 differ from each other and the current format -# Building python 3.8 (new at the time) +# Building python 3.7.17 (new at the time) # wget https://www.python.org/ftp/python/3.7.17/Python-3.7.17.tgz # tar -xzvf Python-3.7.17.tgz # cd Python-3.7.17 @@ -15,7 +16,6 @@ # For other anndata versions: # python -m pip install anndata==0.7.0 "pandas<0.25.0" -# python -m pip install anndata==0.7.8 "pandas<2.0" import anndata as ad @@ -29,7 +29,9 @@ adata = ad.AnnData( X = scipy.sparse.csr_matrix(x), - layers = {"transpose": scipy.sparse.csc_matrix(x)} + layers = {"transpose": scipy.sparse.csc_matrix(x), "dense": x}, + obsm = {"obs_mat": scipy.sparse.csc_matrix(x[:,:2])}, + varm = {"var_mat": x[:2,:].T}, ) adata.write_h5ad(f"mini_mat.anndata-v{ad.__version__}.h5ad") \ No newline at end of file diff --git a/r/tests/data/mini_mat.h5ad b/r/tests/data/mini_mat.anndata-v0.10.9.h5ad similarity index 61% rename from r/tests/data/mini_mat.h5ad rename to r/tests/data/mini_mat.anndata-v0.10.9.h5ad index 9dab8866..6fdcca57 100644 Binary files a/r/tests/data/mini_mat.h5ad and b/r/tests/data/mini_mat.anndata-v0.10.9.h5ad differ diff --git a/r/tests/data/mini_mat.anndata-v0.6.22.h5ad b/r/tests/data/mini_mat.anndata-v0.6.22.h5ad index 2477580d..afdb9402 100644 Binary files a/r/tests/data/mini_mat.anndata-v0.6.22.h5ad and b/r/tests/data/mini_mat.anndata-v0.6.22.h5ad differ diff --git a/r/tests/data/mini_mat.anndata-v0.7.8.h5ad b/r/tests/data/mini_mat.anndata-v0.7.8.h5ad deleted file mode 100644 index 1fa51611..00000000 Binary files a/r/tests/data/mini_mat.anndata-v0.7.8.h5ad and /dev/null differ diff --git a/r/tests/data/mini_mat.anndata-v0.7.h5ad b/r/tests/data/mini_mat.anndata-v0.7.h5ad index 1fa51611..d2c062fa 100644 Binary files a/r/tests/data/mini_mat.anndata-v0.7.h5ad and b/r/tests/data/mini_mat.anndata-v0.7.h5ad differ diff --git a/r/tests/testthat/test-matrix_io.R b/r/tests/testthat/test-matrix_io.R index edba073a..0f3ca40f 100644 --- a/r/tests/testthat/test-matrix_io.R +++ b/r/tests/testthat/test-matrix_io.R @@ -250,8 +250,12 @@ test_that("AnnData read backwards compatibility", { as("dgCMatrix") rownames(ans) <- as.character(0:4) colnames(ans) <- as.character(0:2) + obsm_ans <- ans[1:2,] + rownames(obsm_ans) <- NULL + varm_ans <- t(ans[,1:2]) + rownames(varm_ans) <- NULL - test_files <- c("mini_mat.h5ad", "mini_mat.anndata-v0.6.22.h5ad", "mini_mat.anndata-v0.7.h5ad", "mini_mat.anndata-v0.7.8.h5ad") + test_files <- c("mini_mat.anndata-v0.6.22.h5ad", "mini_mat.anndata-v0.7.h5ad", "mini_mat.anndata-v0.10.9.h5ad") for (f in test_files) { file.copy(file.path("../data", f), file.path(dir, f)) open_matrix_anndata_hdf5(file.path(dir, f)) %>% @@ -260,14 +264,24 @@ test_that("AnnData read backwards compatibility", { open_matrix_anndata_hdf5(file.path(dir, f), group="layers/transpose") %>% as("dgCMatrix") %>% expect_identical(ans) + open_matrix_anndata_hdf5(file.path(dir, f), group="layers/dense") %>% + as("dgCMatrix") %>% + expect_identical(ans) + + open_matrix_anndata_hdf5(file.path(dir, f), group="obsm/obs_mat") %>% + as("dgCMatrix") %>% + expect_identical(obsm_ans) + open_matrix_anndata_hdf5(file.path(dir, f), group="varm/var_mat") %>% + as("dgCMatrix") %>% + expect_identical(varm_ans) } }) test_that("AnnData subset hasn't regressed", { dir <- withr::local_tempdir() # Make a copy since apparently reading the test hdf5 file causes modifications that git detects - file.copy("../data/mini_mat.h5ad", file.path(dir, "mini_mat.h5ad")) - x <- open_matrix_anndata_hdf5(file.path(dir, "mini_mat.h5ad")) %>% + file.copy("../data/mini_mat.anndata-v0.10.9.h5ad", file.path(dir, "mini_mat.anndata-v0.10.9.h5ad")) + x <- open_matrix_anndata_hdf5(file.path(dir, "mini_mat.anndata-v0.10.9.h5ad")) %>% .[1:nrow(.),1:ncol(.)] s <- matrix_stats(x, row_stats="mean", col_stats="mean") @@ -317,10 +331,10 @@ test_that("AnnData types round-trip", { test_that("AnnData and 10x row/col rename works", { dir <- withr::local_tempdir() # Make a copy since apparently reading the test hdf5 file causes modifications that git detects - file.copy("../data/mini_mat.h5ad", file.path(dir, "mini_mat.h5ad")) + file.copy("../data/mini_mat.anndata-v0.10.9.h5ad", file.path(dir, "mini_mat.anndata-v0.10.9.h5ad")) # Test change row+col names on hdf5 - x <- open_matrix_anndata_hdf5(file.path(dir, "mini_mat.h5ad")) + x <- open_matrix_anndata_hdf5(file.path(dir, "mini_mat.anndata-v0.10.9.h5ad")) orig_colnames <- colnames(x) orig_rownames <- rownames(x) @@ -338,7 +352,7 @@ test_that("AnnData and 10x row/col rename works", { expect_identical(rownames(x2_t), colnames(x)) # Test change row+col names on 10x - x_10x <- open_matrix_anndata_hdf5(file.path(dir, "mini_mat.h5ad")) %>% + x_10x <- open_matrix_anndata_hdf5(file.path(dir, "mini_mat.anndata-v0.10.9.h5ad")) %>% convert_matrix_type("uint32_t") %>% write_matrix_10x_hdf5(file.path(dir, "mini_mat.h5")) rownames(x_10x) <- paste0("2row", rownames(x_10x)) @@ -358,9 +372,9 @@ test_that("AnnData and 10x row/col rename works", { test_that("AnnData write to dir matrix works (#57 regression test)", { dir <- withr::local_tempdir() # Make a copy since apparently reading the test hdf5 file causes modifications that git detects - file.copy("../data/mini_mat.h5ad", file.path(dir, "mini_mat.h5ad")) - x <- open_matrix_anndata_hdf5(file.path(dir, "mini_mat.h5ad")) - xt <- open_matrix_anndata_hdf5(file.path(dir, "mini_mat.h5ad"), group="layers/transpose") + file.copy("../data/mini_mat.anndata-v0.10.9.h5ad", file.path(dir, "mini_mat.anndata-v0.10.9.h5ad")) + x <- open_matrix_anndata_hdf5(file.path(dir, "mini_mat.anndata-v0.10.9.h5ad")) + xt <- open_matrix_anndata_hdf5(file.path(dir, "mini_mat.anndata-v0.10.9.h5ad"), group="layers/transpose") y <- write_matrix_dir(x, file.path(dir, "mini_mat")) yt <- write_matrix_dir(xt, file.path(dir, "mini_mat_t"))