Fix major bug in v0.3.1: negative counts (#347)
* Fix #348: add lxml_html_clean dependency

* Fix #306: noise count integer overflow

* Introduce --force-use-checkpoint for redoing v0.3.1 runs

* Add a warning to the README about v0.3.1
sjfleming authored Apr 11, 2024
1 parent a4da38a commit 97c1d40
Showing 14 changed files with 126 additions and 26 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/miniwdl_check.yml
@@ -1,7 +1,12 @@
name: 'validate WDL'
on: [pull_request]

on:
pull_request:
branches: [ master, dev ]

env:
MINIWDL_VERSION: 1.8.0

jobs:
miniwdl-check:
runs-on: ubuntu-latest
4 changes: 3 additions & 1 deletion .github/workflows/run_packaging_check.yml
@@ -2,7 +2,9 @@

name: 'packaging'

on: pull_request
on:
pull_request:
branches: [ master, dev ]

jobs:
build:
4 changes: 3 additions & 1 deletion .github/workflows/run_pytest.yml
@@ -2,7 +2,9 @@

name: 'pytest'

on: pull_request
on:
pull_request:
branches: [ master, dev ]

jobs:
build:
1 change: 1 addition & 0 deletions .gitignore
@@ -18,3 +18,4 @@ dist/
*.csv
*.npz
*.tar.gz
data/
26 changes: 26 additions & 0 deletions README.rst
@@ -36,6 +36,32 @@ The current release contains the following modules. More modules will be added i

Please refer to `the documentation <https://cellbender.readthedocs.io/en/latest/>`_ for a quick start tutorial.

WARNING:

The release tagged v0.3.1 included a bug that caused output count matrices to be incorrect. `The bug
<https://github.com/broadinstitute/CellBender/blame/e2fb5977cb187cb4b12172c9f77ed556bca92cb0/cellbender/remove_background/estimation.py#L241>`_,
introduced in `#303 <https://github.com/broadinstitute/CellBender/pull/303>`_, corrupted the output denoised count matrices
via an integer overflow, which often showed up as negative entries in the output count matrices. The bug was also present on
the master branch until it was fixed by `#347 <https://github.com/broadinstitute/CellBender/pull/347>`_.

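For intuition, here is a minimal sketch (not code from CellBender) of how an integer overflow of this
kind corrupts counts: a too-narrow integer dtype silently wraps large values.

.. code-block:: python

    import numpy as np

    # An index that does not fit in uint16 wraps around: 2**16 + 1 -> 1,
    # so a count can land in the wrong matrix entry.
    idx = np.array([2**16 + 1])
    print(idx.astype(np.uint16))  # [1]

    # An int32 total past 2**31 - 1 wraps negative, yielding a negative "count".
    total = np.int32(2**31 - 1)
    print(total + np.int32(1))  # -2147483648
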
Until v0.3.2 is released, we recommend using either v0.3.0 or the master branch (after #347); once v0.3.2 is available, use it.

Outputs generated with v0.3.1 (or the master branch between #303 and #347) can be salvaged using the
checkpoint file, which is not compromised. The following command re-runs the (inexpensive, CPU-only)
estimation of the output count matrix from the saved posterior in the checkpoint file. Note the new
input argument ``--force-use-checkpoint``, which allows use of a checkpoint file produced by a different CellBender version:

.. code-block:: console

    (cellbender) $ cellbender remove-background \
        --input my_raw_count_matrix_file.h5 \
        --output my_cellbender_output_file.h5 \
        --checkpoint path/to/ckpt.tar.gz \
        --force-use-checkpoint

where ``path/to/ckpt.tar.gz`` is the path to the checkpoint file generated by the original run. Ensure that you pair the right
``--input`` with the right ``--checkpoint``.

Installation and Usage
----------------------

11 changes: 10 additions & 1 deletion cellbender/remove_background/argparser.py
@@ -48,10 +48,19 @@ def add_subparser_args(subparsers: argparse) -> argparse:
subparser.add_argument("--checkpoint", nargs=None, type=str,
dest='input_checkpoint_tarball',
required=False, default=consts.CHECKPOINT_FILE_NAME,
help="Checkpoint tarball produced by the same version "
help="Checkpoint tarball produced by v0.3.0+ "
"of CellBender remove-background. If present, "
"and the workflow hashes match, training will "
"restart from this checkpoint.")
subparser.add_argument("--force-use-checkpoint",
dest='force_use_checkpoint', action="store_true",
help="Normally, checkpoints can only be used if the CellBender "
"code and certain input args match exactly. This flag allows you "
"to bypass this requirement. An example use would be to create a new output "
"using a checkpoint from a run of v0.3.1, a redacted version with "
"faulty output count matrices. If you use this flag, "
"ensure that the input file and the checkpoint match, because "
"CellBender will not check.")
subparser.add_argument("--expected-cells", nargs=None, type=int,
default=None,
dest="expected_cell_count",
36 changes: 23 additions & 13 deletions cellbender/remove_background/checkpoint.py
@@ -154,7 +154,8 @@ def save_checkpoint(filebase: str,

def load_checkpoint(filebase: Optional[str],
tarball_name: str = consts.CHECKPOINT_FILE_NAME,
force_device: Optional[str] = None)\
force_device: Optional[str] = None,
force_use_checkpoint: bool = False)\
-> Dict[str, Union['RemoveBackgroundPyroModel', pyro.optim.PyroOptim, DataLoader, bool]]:
"""Load checkpoint and prepare a RemoveBackgroundPyroModel and optimizer."""

@@ -163,6 +164,7 @@ def load_checkpoint(filebase: Optional[str],
tarball_name=tarball_name,
to_load=['model', 'optim', 'param_store', 'dataloader', 'args', 'random_state'],
force_device=force_device,
force_use_checkpoint=force_use_checkpoint,
)
out.update({'loaded': True})
logger.info(f'Loaded partially-trained checkpoint from {tarball_name}')
@@ -172,7 +174,8 @@ def load_checkpoint(filebase: Optional[str],
def load_from_checkpoint(filebase: Optional[str],
tarball_name: str = consts.CHECKPOINT_FILE_NAME,
to_load: List[str] = ['model'],
force_device: Optional[str] = None) -> Dict:
force_device: Optional[str] = None,
force_use_checkpoint: bool = False) -> Dict:
"""Load specific files from a checkpoint tarball."""

load_kwargs = {}
@@ -192,19 +195,24 @@ def load_from_checkpoint(filebase: Optional[str],
else:
# no tarball loaded, so do not continue trying to load files
raise FileNotFoundError

# See if files have a hash matching input filebase.
if filebase is not None:

# If posterior is present, do not require run hash to match: will pick up
# after training and run estimation from existing posterior.
# This smoothly allows re-runs (including for problematic v0.3.1)
logger.debug(f'force_use_checkpoint: {force_use_checkpoint}')
if force_use_checkpoint or (filebase is None):
filebase = (glob.glob(os.path.join(tmp_dir, '*_model.torch'))[0]
.replace('_model.torch', ''))
logger.debug(f'Accepting any file hash, so loading {filebase}*')

else:
# See if files have a hash matching input filebase.
basename = os.path.basename(filebase)
filebase = os.path.join(tmp_dir, basename)
logger.debug(f'Looking for files with base name matching {filebase}*')
if not os.path.exists(filebase + '_model.torch'):
logger.info('Workflow hash does not match that of checkpoint.')
raise ValueError('Workflow hash does not match that of checkpoint.')
else:
filebase = (glob.glob(os.path.join(tmp_dir, '*_model.torch'))[0]
.replace('_model.torch', ''))
logger.debug(f'Accepting any file hash, so loading {filebase}*')
raise ValueError('Workflow hash does not match that of checkpoint.')

out = {}

@@ -265,9 +273,10 @@ def load_from_checkpoint(filebase: Optional[str],
return out


def attempt_load_checkpoint(filebase: str,
def attempt_load_checkpoint(filebase: Optional[str],
tarball_name: str = consts.CHECKPOINT_FILE_NAME,
force_device: Optional[str] = None)\
force_device: Optional[str] = None,
force_use_checkpoint: bool = False)\
-> Dict[str, Union['RemoveBackgroundPyroModel', pyro.optim.PyroOptim, DataLoader, bool]]:
"""Load checkpoint and prepare a RemoveBackgroundPyroModel and optimizer,
or return the inputs if loading fails."""
@@ -276,7 +285,8 @@ def attempt_load_checkpoint(filebase: str,
logger.debug('Attempting to load checkpoint from ' + tarball_name)
return load_checkpoint(filebase=filebase,
tarball_name=tarball_name,
force_device=force_device)
force_device=force_device,
force_use_checkpoint=force_use_checkpoint)

except FileNotFoundError:
logger.debug('No tarball found')
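
A condensed sketch of the new hash-handling logic in load_from_checkpoint (the helper name
resolve_filebase is illustrative; loading and cleanup are elided):

    import glob
    import os
    from typing import Optional

    def resolve_filebase(tmp_dir: str,
                         filebase: Optional[str],
                         force_use_checkpoint: bool = False) -> str:
        """Return the checkpoint file prefix to load from the unpacked tarball."""
        if force_use_checkpoint or (filebase is None):
            # Accept any file hash: load whatever checkpoint the tarball contains.
            return (glob.glob(os.path.join(tmp_dir, '*_model.torch'))[0]
                    .replace('_model.torch', ''))
        # Otherwise the unpacked files must carry a hash matching this workflow.
        candidate = os.path.join(tmp_dir, os.path.basename(filebase))
        if not os.path.exists(candidate + '_model.torch'):
            raise ValueError('Workflow hash does not match that of checkpoint.')
        return candidate
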
10 changes: 7 additions & 3 deletions cellbender/remove_background/estimation.py
@@ -21,6 +21,10 @@

logger = logging.getLogger('cellbender')

N_CELLS_DATATYPE = np.int32
N_GENES_DATATYPE = np.int32
COUNT_DATATYPE = np.int32


class EstimationMethod(ABC):
"""Base class for estimation of noise counts, given a posterior."""
@@ -52,7 +56,7 @@ def _estimation_array_to_csr(self,
data: np.ndarray,
m: np.ndarray,
noise_offsets: Optional[Dict[int, int]],
dtype=np.int64) -> sp.csr_matrix:
dtype=COUNT_DATATYPE) -> sp.csr_matrix:
"""Say you have point estimates for each count matrix element (data) and
you have the 'm'-indices for each value (m). This returns a CSR matrix
that has the shape of the count matrix, where duplicate entries have
@@ -218,7 +222,7 @@ def _estimation_array_to_csr(index_converter,
data: np.ndarray,
m: np.ndarray,
noise_offsets: Optional[Dict[int, int]],
dtype=np.int) -> sp.csr_matrix:
dtype=COUNT_DATATYPE) -> sp.csr_matrix:
"""Say you have point estimates for each count matrix element (data) and
you have the 'm'-indices for each value (m). This returns a CSR matrix
that has the shape of the count matrix, where duplicate entries have
@@ -238,7 +242,7 @@ def _estimation_array_to_csr(index_converter,
row, col = index_converter.get_ng_indices(m_inds=m)
if noise_offsets is not None:
data = data + np.array([noise_offsets.get(i, 0) for i in m])
coo = sp.coo_matrix((data.astype(dtype), (row.astype(np.uint64), col.astype(np.uint8))),
coo = sp.coo_matrix((data.astype(dtype), (row.astype(N_CELLS_DATATYPE), col.astype(N_GENES_DATATYPE))),
shape=index_converter.matrix_shape, dtype=dtype)
coo.sum_duplicates()
return coo.tocsr()
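
An illustrative sketch (not CellBender code) of why the index dtypes above matter: casting
sparse-matrix indices to a too-narrow integer type silently wraps them, so counts are summed
into the wrong entries.

    import numpy as np
    import scipy.sparse as sp

    row = np.array([0])
    col = np.array([300])
    data = np.array([5], dtype=np.int32)

    # Gene index 300 cast to uint8 wraps to 300 % 256 == 44: same count, wrong entry.
    bad = sp.coo_matrix((data, (row, col.astype(np.uint8))), shape=(1, 400)).tocsr()
    good = sp.coo_matrix((data, (row, col.astype(np.int32))), shape=(1, 400)).tocsr()
    print(bad[0, 44], good[0, 300])  # 5 5
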
6 changes: 4 additions & 2 deletions cellbender/remove_background/posterior.py
@@ -107,13 +107,15 @@ def _do_posterior_regularization(posterior: Posterior):
try:
ckpt_posterior = load_from_checkpoint(tarball_name=args.input_checkpoint_tarball,
filebase=args.checkpoint_filename,
to_load=['posterior'])
to_load=['posterior'],
force_use_checkpoint=args.force_use_checkpoint)
except ValueError:
# input checkpoint tarball was not a match for this workflow
# but we still may have saved a new tarball
ckpt_posterior = load_from_checkpoint(tarball_name=consts.CHECKPOINT_FILE_NAME,
filebase=args.checkpoint_filename,
to_load=['posterior'])
to_load=['posterior'],
force_use_checkpoint=args.force_use_checkpoint)
if os.path.exists(ckpt_posterior.get('posterior_file', 'does_not_exist')):
# Load posterior if it was saved in the checkpoint.
posterior.load(file=ckpt_posterior['posterior_file'])
6 changes: 5 additions & 1 deletion cellbender/remove_background/run.py
@@ -307,6 +307,9 @@ def compute_output_denoised_counts_reports_metrics(posterior: Posterior,
posterior.latents_map['p'],
)

# Failsafe to ensure no negative counts.
assert np.all(denoised_counts.data >= 0), 'Negative count matrix entries in output'

# TODO: correct cell probabilities so that any zero-count droplet becomes "empty"

# Save denoised count matrix.
@@ -627,7 +630,8 @@ def run_inference(dataset_obj: SingleCellRNACountsDataset,
# Attempt to load from a previously-saved checkpoint.
ckpt = attempt_load_checkpoint(filebase=checkpoint_filename,
tarball_name=args.input_checkpoint_tarball,
force_device='cuda:0' if args.use_cuda else 'cpu')
force_device='cuda:0' if args.use_cuda else 'cpu',
force_use_checkpoint=args.force_use_checkpoint)
ckpt_loaded = ckpt['loaded'] # True if a checkpoint was loaded successfully

if ckpt_loaded:
1 change: 1 addition & 0 deletions cellbender/remove_background/tests/test_checkpoint.py
@@ -414,6 +414,7 @@ def test_save_and_load_cellbender_checkpoint(tmpdir_factory, cuda, scheduler):
args.constant_learning_rate = not scheduler
args.debug = False
args.input_checkpoint_tarball = 'none'
args.force_use_checkpoint = False

create_random_state_blank_slate(0)
pyro.clear_param_store()
36 changes: 33 additions & 3 deletions cellbender/remove_background/tests/test_estimation.py
@@ -6,9 +6,10 @@
import torch

from cellbender.remove_background.estimation import Mean, MAP, \
SingleSample, ThresholdCDF, MultipleChoiceKnapsack, pandas_grouped_apply
SingleSample, ThresholdCDF, MultipleChoiceKnapsack, pandas_grouped_apply, _estimation_array_to_csr, COUNT_DATATYPE
from cellbender.remove_background.posterior import IndexConverter, \
dense_to_sparse_op_torch, log_prob_sparse_to_dense
from cellbender.remove_background.tests.conftest import sparse_matrix_equal

from typing import Dict, Union

@@ -92,11 +93,15 @@ def test_mean_massive_m(log_prob_coo):
new_shape = (coo.shape[0] + greater_than_max_int32, coo.shape[1])
new_coo = sp.coo_matrix((coo.data, (new_row, coo.col)),
shape=new_shape)
print(f'original COO shape: {coo.shape}')
print(f'new COO shape: {new_coo.shape}')
print(f'new row minimum value: {new_coo.row.min()}')
print(f'new row maximum value: {new_coo.row.max()}')
offset_dict = {k + greater_than_max_int32: v for k, v in log_prob_coo['offsets'].items()}

# this is just a shim
converter = IndexConverter(total_n_cells=2,
total_n_genes=new_coo.shape[0])
converter = IndexConverter(total_n_cells=new_coo.shape[0],
total_n_genes=new_coo.shape[1])

# set up and estimate
estimator = Mean(index_converter=converter)
@@ -379,3 +384,28 @@ def test_parallel_pandas_grouped_apply(fun):

np.testing.assert_array_equal(reg['m'], parallel['m'])
np.testing.assert_array_equal(reg['result'], parallel['result'])


def test_estimation_array_to_csr():

larger_than_uint16 = 2**16 + 1

converter = IndexConverter(total_n_cells=larger_than_uint16,
total_n_genes=larger_than_uint16)
m = larger_than_uint16 + np.arange(-10, 10)
data = np.random.rand(len(m)) * -10
noise_offsets = None

output_csr = _estimation_array_to_csr(index_converter=converter, data=data, m=m, noise_offsets=noise_offsets, dtype=COUNT_DATATYPE)

# reimplementation here with totally permissive datatypes
cell_and_gene_dtype = np.float64
row, col = converter.get_ng_indices(m_inds=m)
if noise_offsets is not None:
data = data + np.array([noise_offsets.get(i, 0) for i in m])
coo = sp.coo_matrix((data.astype(COUNT_DATATYPE), (row.astype(cell_and_gene_dtype), col.astype(cell_and_gene_dtype))),
shape=converter.matrix_shape, dtype=COUNT_DATATYPE)
coo.sum_duplicates()
truth_csr = coo.tocsr()

assert sparse_matrix_equal(output_csr, truth_csr)
3 changes: 3 additions & 0 deletions cellbender/remove_background/tests/test_integration.py
@@ -50,3 +50,6 @@ def test_full_run(tmpdir_factory, h5_v3_file, cuda):
adata_cell_barcodes = adata.obs_names[adata.obs['cell_probability'] > consts.CELL_PROB_CUTOFF]
assert set(cell_barcodes) == set(adata_cell_barcodes), \
'Cell barcodes in h5 are different from those in CSV file'

# ensure there are no negative count matrix entries in the output
assert np.all(adata.X.data >= 0), 'Negative count matrix entries in output'
1 change: 1 addition & 0 deletions requirements.txt
@@ -12,4 +12,5 @@ jupyter
jupyter_contrib_nbextensions
notebook<7.0.0
nbconvert<7.0.0
lxml_html_clean
psutil
