diff --git a/.github/workflows/run_pytest.yml b/.github/workflows/run_pytest.yml index 37e652b..7c8c662 100644 --- a/.github/workflows/run_pytest.yml +++ b/.github/workflows/run_pytest.yml @@ -1,4 +1,4 @@ -# Run cellbender's tests +# Run cellbender test suite name: 'pytest' @@ -7,11 +7,13 @@ on: pull_request jobs: build: - runs-on: 'ubuntu-latest' strategy: matrix: + os: ['ubuntu-latest', 'windows-latest'] python-version: ['3.7'] + runs-on: ${{ matrix.os }} + steps: - name: 'Checkout repo' uses: actions/checkout@v3 diff --git a/cellbender/remove_background/checkpoint.py b/cellbender/remove_background/checkpoint.py index 5b0903d..206737b 100644 --- a/cellbender/remove_background/checkpoint.py +++ b/cellbender/remove_background/checkpoint.py @@ -297,7 +297,7 @@ def make_tarball(files: List[str], tarball_name: str) -> bool: for file in files: # without arcname, unpacking results in unpredictable file locations! tar.add(file, arcname=os.path.basename(file)) - os.rename(tarball_name + '.tmp', tarball_name) + os.replace(tarball_name + '.tmp', tarball_name) return True diff --git a/cellbender/remove_background/estimation.py b/cellbender/remove_background/estimation.py index 371a7fa..b4866d2 100644 --- a/cellbender/remove_background/estimation.py +++ b/cellbender/remove_background/estimation.py @@ -218,7 +218,7 @@ def _estimation_array_to_csr(index_converter, data: np.ndarray, m: np.ndarray, noise_offsets: Optional[Dict[int, int]], - dtype=np.int64) -> sp.csr_matrix: + dtype=np.int) -> sp.csr_matrix: """Say you have point estimates for each count matrix element (data) and you have the 'm'-indices for each value (m). This returns a CSR matrix that has the shape of the count matrix, where duplicate entries have @@ -229,7 +229,7 @@ def _estimation_array_to_csr(index_converter, a flat format, indexed by 'm'. m: Array of the same length as data, where each entry is an m-index. noise_offsets: Noise count offset values keyed by 'm'. - dtype: Data type for sparse matrix. Int32 is too small for 'm' indices. + dtype: Data type for values of sparse matrix Results: noise_csr: Noise point estimate, as a CSR sparse matrix. @@ -238,7 +238,7 @@ def _estimation_array_to_csr(index_converter, row, col = index_converter.get_ng_indices(m_inds=m) if noise_offsets is not None: data = data + np.array([noise_offsets.get(i, 0) for i in m]) - coo = sp.coo_matrix((data.astype(dtype), (row.astype(dtype), col.astype(dtype))), + coo = sp.coo_matrix((data.astype(dtype), (row.astype(np.uint64), col.astype(np.uint8))), shape=index_converter.matrix_shape, dtype=dtype) coo.sum_duplicates() return coo.tocsr() @@ -785,7 +785,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix, """ array_length = len(np.unique(noise_log_prob_coo.row)) - m = np.zeros(array_length) + m = np.zeros(array_length, dtype=np.uint64) out = np.zeros(array_length) a = 0 @@ -804,7 +804,7 @@ def apply_function_dense_chunks(noise_log_prob_coo: sp.coo_matrix, out[a:(a + len_s)] = s.detach().cpu().numpy() a = a + len_s - return {'m': m.astype(int), 'result': out} + return {'m': m, 'result': out} def pandas_grouped_apply(coo: sp.coo_matrix, diff --git a/cellbender/remove_background/posterior.py b/cellbender/remove_background/posterior.py index 6aaeefa..c421713 100644 --- a/cellbender/remove_background/posterior.py +++ b/cellbender/remove_background/posterior.py @@ -555,7 +555,7 @@ def _get_cell_noise_count_posterior_coo( # Put the counts into a sparse csr_matrix. self._noise_count_posterior_coo = sp.coo_matrix( (log_probs, (m, c)), - shape=[np.prod(self.count_matrix_shape), n_counts_max], + shape=[np.prod(self.count_matrix_shape, dtype=np.uint64), n_counts_max], ) self._noise_count_posterior_coo_offsets = nonzero_noise_offset_dict return self._noise_count_posterior_coo @@ -1528,7 +1528,7 @@ def get_m_indices(self, cell_inds: np.ndarray, gene_inds: np.ndarray) -> np.ndar if not ((gene_inds >= 0) & (gene_inds < self.total_n_genes)).all(): raise ValueError(f'Requested gene_inds out of range: ' f'{gene_inds[(gene_inds < 0) | (gene_inds >= self.total_n_genes)]}') - return cell_inds * self.total_n_genes + gene_inds + return cell_inds.astype(np.uint64) * self.total_n_genes + gene_inds.astype(np.uint64) def get_ng_indices(self, m_inds: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Given a list of 'm' index values, return two arrays: cell index values diff --git a/cellbender/remove_background/tests/conftest.py b/cellbender/remove_background/tests/conftest.py index 8bafa2d..6a43010 100644 --- a/cellbender/remove_background/tests/conftest.py +++ b/cellbender/remove_background/tests/conftest.py @@ -15,12 +15,32 @@ USE_CUDA = torch.cuda.is_available() -def sparse_matrix_equal(mat1: sp.csc_matrix, - mat2: sp.csc_matrix): +def sparse_matrix_equal(mat1, mat2): """Fast assertion that sparse matrices are equal""" + if type(mat1) == sp.coo_matrix: + return coo_equal(mat1, mat2) + elif (type(mat1) == sp.csc_matrix) or (type(mat1) == sp.csr_matrix): + return csc_or_csr_equal(mat1, mat2) + else: + raise ValueError(f"Error with test functions: sparse_matrix_equal() was called with a {type(mat1)}") + + +def csc_or_csr_equal(mat1: sp.csc_matrix, + mat2: sp.csc_matrix): + """Fast assertion that CSC sparse matrices are equal""" return (mat1 != mat2).sum() == 0 +def coo_equal(mat1: sp.csc_matrix, + mat2: sp.csc_matrix): + """Fast assertion that COO sparse matrices are equal""" + return ( + ((mat1.data != mat2.data).sum() == 0) + and ((mat1.row != mat2.row).sum() == 0) + and ((mat1.col != mat2.col).sum() == 0) + ) + + def tensors_equal(a: torch.Tensor, b: torch.Tensor): """Assertion that torch tensors are equal for each element""" diff --git a/cellbender/remove_background/tests/test_estimation.py b/cellbender/remove_background/tests/test_estimation.py index 528ab67..c1cb931 100644 --- a/cellbender/remove_background/tests/test_estimation.py +++ b/cellbender/remove_background/tests/test_estimation.py @@ -81,6 +81,29 @@ def log_prob_coo(request, log_prob_coo_base) \ raise ValueError(f'Test writing error: requested "{request.param}" log_prob_coo') +def test_mean_massive_m(log_prob_coo): + """Sets up a posterior COO with massive m values that are > max(int32). + Will trigger github issue #252 if a bug exists, no assertion necessary. + """ + + coo = log_prob_coo['coo'] + greater_than_max_int32 = 2200000000 + new_row = coo.row.astype(np.int64) + greater_than_max_int32 + new_shape = (coo.shape[0] + greater_than_max_int32, coo.shape[1]) + new_coo = sp.coo_matrix((coo.data, (new_row, coo.col)), + shape=new_shape) + offset_dict = {k + greater_than_max_int32: v for k, v in log_prob_coo['offsets'].items()} + + # this is just a shim + converter = IndexConverter(total_n_cells=2, + total_n_genes=new_coo.shape[0]) + + # set up and estimate + estimator = Mean(index_converter=converter) + noise_csr = estimator.estimate_noise(noise_log_prob_coo=new_coo, + noise_offsets=offset_dict) + + @pytest.fixture(scope='module', params=['exact', 'filtered', 'unsorted']) def mckp_log_prob_coo(request, log_prob_coo_base) \ -> Dict[str, Union[sp.coo_matrix, np.ndarray, Dict[int, int]]]: diff --git a/cellbender/remove_background/tests/test_posterior.py b/cellbender/remove_background/tests/test_posterior.py index e5382b3..04b766e 100644 --- a/cellbender/remove_background/tests/test_posterior.py +++ b/cellbender/remove_background/tests/test_posterior.py @@ -368,23 +368,35 @@ def test_compute_mean_target_removal_as_function(log_prob_coo, fpr, per_gene, cu @pytest.mark.parametrize('blank_noise_offsets', [False, True], ids=['', 'no_noise_offsets']) -def test_save_and_load(tmpdir_factory, blank_noise_offsets): +@pytest.mark.parametrize('m', [1000, 2200000000], ids=['small', 'big']) +def test_save_and_load(tmpdir_factory, blank_noise_offsets, m): """Test that a round trip through save and load gives the same thing""" tmp_dir = tmpdir_factory.mktemp('posterior') filename = tmp_dir.join('posterior.h5') - m = 1000 n = 20 + num_nonzeros = 1000 posterior = Posterior(dataset_obj=None, vi_model=None) # blank - posterior_coo = sp.random(m, n, density=0.1, format='coo', dtype=float) - posterior_coo2 = sp.random(m, n, density=0.08, format='coo', dtype=float) + # old way that cannot handle large m + # posterior_coo = sp.random(m, n, density=0.1, format='coo', dtype=float) + # posterior_coo2 = sp.random(m, n, density=0.08, format='coo', dtype=float) + + m_array = np.random.randint(low=0, high=m, size=num_nonzeros - 1, dtype=np.uint64) + m_array = np.concatenate([m_array, np.array(m - 1, dtype=np.uint64)], axis=None) + n_array = np.random.randint(low=0, high=n, size=num_nonzeros) + val_array = np.random.rand(num_nonzeros) * -10 + val_array2 = np.random.rand(num_nonzeros) * -5 + + posterior_coo = sp.coo_matrix((val_array, (m_array, n_array)), shape=(m, n)) + posterior_coo2 = sp.coo_matrix((val_array2, (m_array, n_array)), shape=(m, n)) + if blank_noise_offsets: noise_offsets = {} else: - noise_offsets = dict(zip(np.random.randint(low=0, high=(m - 1), size=10), + noise_offsets = dict(zip(np.random.randint(low=0, high=(m - 1), size=10, dtype=np.uint64), np.random.randint(low=1, high=5, size=10))) kwargs = {'a': 'b', 'c': 1} kwargs2 = {'a': 'method', 'c': 1} @@ -396,7 +408,7 @@ def test_save_and_load(tmpdir_factory, blank_noise_offsets): posterior._noise_count_regularized_posterior_coo = posterior_coo2 posterior._noise_count_regularized_posterior_kwargs = kwargs2 posterior._latents = {'p': np.random.randn(100), 'd': np.random.randn(100)} - posterior.index_converter = IndexConverter(total_n_cells=1000, total_n_genes=1000) + posterior.index_converter = IndexConverter(total_n_cells=max(1000, m // 1000 + 1), total_n_genes=1000) # save posterior.save(file=str(filename)) @@ -406,9 +418,14 @@ def test_save_and_load(tmpdir_factory, blank_noise_offsets): posterior2.load(file=str(filename)) # check - for attr in ['_noise_count_posterior_coo', '_noise_count_posterior_coo_offsets', - '_noise_count_posterior_kwargs', '_noise_count_regularized_posterior_coo', - '_noise_count_regularized_posterior_kwargs', '_latents']: + for attr in [ + '_noise_count_posterior_coo', + '_noise_count_posterior_coo_offsets', + '_noise_count_posterior_kwargs', + '_noise_count_regularized_posterior_coo', + '_noise_count_regularized_posterior_kwargs', + '_latents' + ]: val1 = getattr(posterior, attr) val2 = getattr(posterior2, attr) print(f'{attr} ===================')