broadinstitute · sjfleming · Apr 11, 2024 · Apr 5, 2024 · Apr 5, 2024 · Apr 5, 2024
diff --git a/.github/workflows/miniwdl_check.yml b/.github/workflows/miniwdl_check.yml
@@ -1,7 +1,12 @@
 name: 'validate WDL'
-on: [pull_request]
+
+on: 
+  pull_request:
+    branches: [ master, dev ]  # validate WDL only for pull requests whose base branch is master or dev
+
 env:
   MINIWDL_VERSION: 1.8.0
+
 jobs:
   miniwdl-check:
     runs-on: ubuntu-latest

diff --git a/.github/workflows/run_packaging_check.yml b/.github/workflows/run_packaging_check.yml
@@ -2,7 +2,9 @@
 
 name: 'packaging'
 
-on: pull_request
+on: 
+  pull_request:
+    branches: [ master, dev ]  # run the packaging check only for pull requests whose base branch is master or dev
 
 jobs:
   build:

diff --git a/.github/workflows/run_pytest.yml b/.github/workflows/run_pytest.yml
@@ -2,7 +2,9 @@
 
 name: 'pytest'
 
-on: pull_request
+on: 
+  pull_request:
+    branches: [ master, dev ]  # run pytest only for pull requests whose base branch is master or dev
 
 jobs:
   build:

diff --git a/.gitignore b/.gitignore
@@ -18,3 +18,4 @@ dist/
 *.csv
 *.npz
 *.tar.gz
+data/
diff --git a/cellbender/remove_background/estimation.py b/cellbender/remove_background/estimation.py
@@ -21,6 +21,10 @@
 
 logger = logging.getLogger('cellbender')
 
+N_CELLS_DATATYPE = np.int32  # row-index dtype for the COO matrix in _estimation_array_to_csr; NOTE(review): confirm row indices never exceed the int32 maximum
+N_GENES_DATATYPE = np.int32  # column-index dtype for the COO matrix in _estimation_array_to_csr
+COUNT_DATATYPE = np.int32  # default dtype of the values stored by _estimation_array_to_csr
+
 
 class EstimationMethod(ABC):
     """Base class for estimation of noise counts, given a posterior."""
@@ -52,7 +56,7 @@ def _estimation_array_to_csr(self,
                                  data: np.ndarray,
                                  m: np.ndarray,
                                  noise_offsets: Optional[Dict[int, int]],
-                                 dtype=np.int64) -> sp.csr_matrix:
+                                 dtype=COUNT_DATATYPE) -> sp.csr_matrix:
         """Say you have point estimates for each count matrix element (data) and
         you have the 'm'-indices for each value (m). This returns a CSR matrix
         that has the shape of the count matrix, where duplicate entries have
@@ -218,7 +222,7 @@ def _estimation_array_to_csr(index_converter,
                              data: np.ndarray,
                              m: np.ndarray,
                              noise_offsets: Optional[Dict[int, int]],
-                             dtype=np.int) -> sp.csr_matrix:
+                             dtype=COUNT_DATATYPE) -> sp.csr_matrix:
     """Say you have point estimates for each count matrix element (data) and
     you have the 'm'-indices for each value (m). This returns a CSR matrix
     that has the shape of the count matrix, where duplicate entries have
@@ -238,7 +242,7 @@ def _estimation_array_to_csr(index_converter,
     row, col = index_converter.get_ng_indices(m_inds=m)
     if noise_offsets is not None:
         data = data + np.array([noise_offsets.get(i, 0) for i in m])
-    coo = sp.coo_matrix((data.astype(dtype), (row.astype(np.uint64), col.astype(np.uint8))),
+    coo = sp.coo_matrix((data.astype(dtype), (row.astype(N_CELLS_DATATYPE), col.astype(N_GENES_DATATYPE))),
                         shape=index_converter.matrix_shape, dtype=dtype)
     coo.sum_duplicates()
     return coo.tocsr()

diff --git a/cellbender/remove_background/tests/test_estimation.py b/cellbender/remove_background/tests/test_estimation.py
@@ -6,9 +6,10 @@
 import torch
 
 from cellbender.remove_background.estimation import Mean, MAP, \
-    SingleSample, ThresholdCDF, MultipleChoiceKnapsack, pandas_grouped_apply
+    SingleSample, ThresholdCDF, MultipleChoiceKnapsack, pandas_grouped_apply, _estimation_array_to_csr, COUNT_DATATYPE
 from cellbender.remove_background.posterior import IndexConverter, \
     dense_to_sparse_op_torch, log_prob_sparse_to_dense
+from cellbender.remove_background.tests.conftest import sparse_matrix_equal
 
 from typing import Dict, Union
 
@@ -92,11 +93,12 @@ def test_mean_massive_m(log_prob_coo):
     new_shape = (coo.shape[0] + greater_than_max_int32, coo.shape[1])
     new_coo = sp.coo_matrix((coo.data, (new_row, coo.col)),
                             shape=new_shape)
+    print(new_coo)
     offset_dict = {k + greater_than_max_int32: v for k, v in log_prob_coo['offsets'].items()}
 
     # this is just a shim
-    converter = IndexConverter(total_n_cells=2,
-                               total_n_genes=new_coo.shape[0])
+    converter = IndexConverter(total_n_cells=new_coo.shape[0],
+                               total_n_genes=new_coo.shape[1])
 
     # set up and estimate
     estimator = Mean(index_converter=converter)
@@ -379,3 +381,28 @@ def test_parallel_pandas_grouped_apply(fun):
 
     np.testing.assert_array_equal(reg['m'], parallel['m'])
     np.testing.assert_array_equal(reg['result'], parallel['result'])
+
+
+def test_estimation_array_to_csr():
+    """Check _estimation_array_to_csr output against an inline reference built from the same inputs."""
+    larger_than_uint16 = 2**16 + 1
+
+    converter = IndexConverter(total_n_cells=larger_than_uint16,
+                               total_n_genes=larger_than_uint16)
+    m = larger_than_uint16 + np.arange(-10, 10)  # 20 consecutive m-indices, 2**16 - 9 through 2**16 + 10
+    data = np.random.rand(len(m)) * -10  # unseeded random values in (-10, 0]
+    noise_offsets = None  # None, so the offset branch below is not taken
+
+    output_csr = _estimation_array_to_csr(index_converter=converter, data=data, m=m, noise_offsets=noise_offsets, dtype=COUNT_DATATYPE)
+
+    # reference result: repeat the construction here, casting the row/col indices to float64
+    cell_and_gene_dtype = np.float64
+    row, col = converter.get_ng_indices(m_inds=m)
+    if noise_offsets is not None:
+        data = data + np.array([noise_offsets.get(i, 0) for i in m])
+    coo = sp.coo_matrix((data.astype(COUNT_DATATYPE), (row.astype(cell_and_gene_dtype), col.astype(cell_and_gene_dtype))),
+                        shape=converter.matrix_shape, dtype=COUNT_DATATYPE)
+    coo.sum_duplicates()  # merge entries that share a (row, col) position
+    truth_csr = coo.tocsr()
+
+    assert sparse_matrix_equal(output_csr, truth_csr)
diff --git a/cellbender/remove_background/tests/test_integration.py b/cellbender/remove_background/tests/test_integration.py
@@ -50,3 +50,6 @@ def test_full_run(tmpdir_factory, h5_v3_file, cuda):
     adata_cell_barcodes = adata.obs_names[adata.obs['cell_probability'] > consts.CELL_PROB_CUTOFF]
     assert set(cell_barcodes) == set(adata_cell_barcodes), \
         'Cell barcodes in h5 are different from those in CSV file'
+
+    # ensure there are no negative count matrix entries in the output
+    assert np.all(adata.X.data >= 0), 'Negative count matrix entries in output'
diff --git a/requirements.txt b/requirements.txt
@@ -12,4 +12,5 @@ jupyter
 jupyter_contrib_nbextensions
 notebook<7.0.0
 nbconvert<7.0.0
+lxml_html_clean
 psutil
-Original file line number
+Diff line change
@@ -18,3 +18,4 @@ dist/
 *.csv
 *.npz
 *.tar.gz
+data/