diff --git a/diffxpy/api/utils.py b/diffxpy/api/utils.py
index b2e4ded..101817c 100644
--- a/diffxpy/api/utils.py
+++ b/diffxpy/api/utils.py
@@ -1,4 +1,4 @@
-from diffxpy.testing.utils import constraint_matrix_from_string, constraint_matrix_from_dict, \
+from diffxpy.testing.utils import constraint_matrix_from_string, constraint_system_from_dict, \
     constraint_system_from_star
 from diffxpy.testing.utils import design_matrix
 from diffxpy.testing.utils import view_coef_names, preview_coef_names
diff --git a/diffxpy/fit/fit.py b/diffxpy/fit/fit.py
index dc1c86b..c03ccc3 100644
--- a/diffxpy/fit/fit.py
+++ b/diffxpy/fit/fit.py
@@ -9,6 +9,7 @@
 import pandas as pd
 import patsy
 import scipy.sparse
+import dask
 from typing import Union, List, Dict, Callable, Tuple
 
 from .external import _fit
@@ -17,7 +18,7 @@
 
 
 def model(
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         formula_loc: Union[None, str] = None,
         formula_scale: Union[None, str] = "~1",
         as_numeric: Union[List[str], Tuple[str], str] = (),
@@ -192,23 +193,23 @@ def model(
     )
 
     design_loc, design_loc_names, constraints_loc, term_names_loc = constraint_system_from_star(
+        constraints=constraints_loc,
         dmat=dmat_loc,
         sample_description=sample_description,
         formula=formula_loc,
         as_numeric=as_numeric,
-        constraints=constraints_loc,
         return_type="patsy"
     )
     design_scale, design_scale_names, constraints_scale, term_names_scale = constraint_system_from_star(
+        constraints=constraints_scale,
         dmat=dmat_scale,
         sample_description=sample_description,
         formula=formula_scale,
         as_numeric=as_numeric,
-        constraints=constraints_scale,
         return_type="patsy"
     )
 
-    model = _fit(
+    estim = _fit(
         noise_model=noise_model,
         data=data,
         design_loc=design_loc,
@@ -226,11 +227,11 @@
         quick_scale=quick_scale,
         dtype=dtype
     )
-    return model
+    return estim
 
 
 def residuals(
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         formula_loc: Union[None, str] = None,
         formula_scale: Union[None, str] = "~1",
         as_numeric: Union[List[str], Tuple[str], str] = (),
@@ -401,12 +402,12 @@ def residuals(
         dtype=dtype,
         **kwargs
     )
-    residuals = estim.x - estim.model.location
+    residuals = estim.model_container.x - estim.model_container.location
     return residuals
 
 
 def partition(
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         parts: Union[str, np.ndarray, list],
         gene_names: Union[np.ndarray, list] = None,
         sample_description: pd.DataFrame = None,
@@ -460,7 +461,7 @@ class _Partition:
 
     def __init__(
             self,
-            data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+            data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
             parts: Union[str, np.ndarray, list],
             gene_names: Union[np.ndarray, list] = None,
             sample_description: pd.DataFrame = None,
@@ -487,12 +488,14 @@ def __init__(
             same order as in data or string-type column identifier of size-factor containing column in sample description.
         """
-        if isinstance(data, glm.typing.InputDataBase):
+        if isinstance(data, glm.utils.input.InputDataGLM):
             self.x = data.x
         elif isinstance(data, anndata.AnnData) or isinstance(data, Raw):
             self.x = data.X
         elif isinstance(data, np.ndarray):
             self.x = data
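+        # New branch: partitions are sliced eagerly below, so a dask input is
+        # materialized up front (assumes the matrix fits into memory).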
""" - if isinstance(data, glm.typing.InputDataBase): + if isinstance(data, glm.utils.input.InputDataGLM): self.x = data.x elif isinstance(data, anndata.AnnData) or isinstance(data, Raw): self.x = data.X elif isinstance(data, np.ndarray): self.x = data + elif isinstance(data, dask.array.core.Array): + self.x = data.compute() # ? else: raise ValueError("data type %s not recognized" % type(data)) self.gene_names = parse_gene_names(data, gene_names) diff --git a/diffxpy/pkg_constants.py b/diffxpy/pkg_constants.py index 12dfea2..4d99f4e 100644 --- a/diffxpy/pkg_constants.py +++ b/diffxpy/pkg_constants.py @@ -13,6 +13,6 @@ BATCHGLM_PROVIDE_FIM = True BATCHGLM_PROVIDE_HESSIAN = False -BATCHGLM_BACKEND = "tf1" +BATCHGLM_BACKEND = "numpy" BATCHGLM_FEATUREWISE = True BATCHGLM_AUTOGRAD = True diff --git a/diffxpy/testing/det.py b/diffxpy/testing/det.py index f512335..15f826f 100644 --- a/diffxpy/testing/det.py +++ b/diffxpy/testing/det.py @@ -13,6 +13,9 @@ import scipy.sparse import sparse from typing import Union, Dict, Tuple, List, Set +from batchglm.models.glm_norm import Model +from batchglm.utils.input import InputDataGLM +from batchglm.train.numpy.glm_norm import Estimator from .utils import split_x, dmat_unique from ..stats import stats @@ -465,17 +468,17 @@ class DifferentialExpressionTestLRT(_DifferentialExpressionTestSingle): sample_description: pd.DataFrame full_design_loc_info: patsy.design_info - full_estim: glm.typing.EstimatorBaseTyping + full_estim: glm.train.base.BaseEstimatorGlm reduced_design_loc_info: patsy.design_info - reduced_estim: glm.typing.EstimatorBaseTyping + reduced_estim: glm.train.base.BaseEstimatorGlm def __init__( self, sample_description: pd.DataFrame, full_design_loc_info: patsy.design_info, - full_estim: glm.typing.EstimatorBaseTyping, + full_estim: glm.train.base.BaseEstimatorGlm, reduced_design_loc_info: patsy.design_info, - reduced_estim: glm.typing.EstimatorBaseTyping + reduced_estim: glm.train.base.BaseEstimatorGlm ): super().__init__() self.sample_description = sample_description @@ -486,31 +489,42 @@ def __init__( @property def gene_ids(self) -> np.ndarray: - return np.asarray(self.full_estim.input_data.features) + return np.asarray(self.full_estim.model_container.features) @property def x(self): - return self.full_estim.x + return self.full_estim.model_container.x @property def reduced_model_gradient(self): - return self.reduced_estim.jacobian + return np.sum( + np.abs(self.reduced_estim.model_container.jac.compute() / self.reduced_estim.model_container.x.shape[0]), axis=1 + ) @property def full_model_gradient(self): - return self.full_estim.jacobian + return np.sum( + np.abs(self.full_estim.model_container.jac.compute() / self.full_estim.model_container.x.shape[0]), + axis=1 + ) def _test(self): - if np.any(self.full_estim.log_likelihood < self.reduced_estim.log_likelihood): + ll_full = self.full_estim.model_container.ll_byfeature + ll_reduced = self.reduced_estim.model_container.ll_byfeature + if isinstance(ll_full, dask.array.core.Array): + ll_full = ll_full.compute() + if isinstance(ll_reduced, dask.array.core.Array): + ll_reduced = ll_reduced.compute() + if np.any(ll_full < ll_reduced): logger.warning("Test assumption failed: full model is (partially) less probable than reduced model") return stats.likelihood_ratio_test( - ll_full=self.full_estim.log_likelihood, - ll_reduced=self.reduced_estim.log_likelihood, - df_full=self.full_estim.input_data.constraints_loc.shape[1] + - self.full_estim.input_data.constraints_scale.shape[1], - 
+        return np.sum(
+            np.abs(self.reduced_estim.model_container.jac.compute() / self.reduced_estim.model_container.x.shape[0]),
+            axis=1
+        )
 
     @property
     def full_model_gradient(self):
-        return self.full_estim.jacobian
+        return np.sum(
+            np.abs(self.full_estim.model_container.jac.compute() / self.full_estim.model_container.x.shape[0]),
+            axis=1
+        )
 
     def _test(self):
-        if np.any(self.full_estim.log_likelihood < self.reduced_estim.log_likelihood):
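+        # ll_byfeature may be a lazy dask expression; materialize it once so
+        # that the sanity check and the test statistic below operate on
+        # concrete numpy arrays.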
""" - model_estim: glm.typing.EstimatorBaseTyping + model_estim: glm.train.base.BaseEstimatorGlm sample_description: pd.DataFrame coef_loc_totest: np.ndarray theta_mle: np.ndarray @@ -694,7 +708,7 @@ class DifferentialExpressionTestWald(_DifferentialExpressionTestSingle): def __init__( self, - model_estim: glm.typing.EstimatorBaseTyping, + model_estim: glm.train.base.BaseEstimatorGlm, col_indices: np.ndarray, noise_model: str, sample_description: pd.DataFrame @@ -712,16 +726,16 @@ def __init__( self._store_ols = None try: - if self.model_estim.error_codes is not None: - self._error_codes = self.model_estim.error_codes + if self.model_estim.model_container.error_codes is not None: + self._error_codes = self.model_estim.model_container.error_codes else: self._error_codes = None except Exception as e: self._error_codes = None try: - if self.model_estim.niter is not None: - self._niter = self.model_estim.niter + if self.model_estim.model_container.niter is not None: + self._niter = self.model_estim.model_container.niter else: self._niter = None except Exception as e: @@ -729,15 +743,17 @@ def __init__( @property def gene_ids(self) -> np.ndarray: - return np.asarray(self.model_estim.input_data.features) + return np.asarray(self.model_estim.model_container.features) @property def x(self): - return self.model_estim.x + return self.model_estim.model_container.x @property def model_gradient(self): - return self.model_estim.jacobian + return np.sum( + np.abs(self.model_estim.model_container.jac.compute() / self.model_estim.model_container.x.shape[0]), axis=1 + ) def log_fold_change(self, base=np.e, **kwargs): """ @@ -753,16 +769,16 @@ def log_fold_change(self, base=np.e, **kwargs): # loc = dmat @ self.model_estim.par_link_loc[self.coef_loc_totest] # return loc[1] - loc[0] if len(self.coef_loc_totest) == 1: - return self.model_estim.a_var[self.coef_loc_totest][0] + return self.model_estim.model_container.theta_location[self.coef_loc_totest][0] else: - idx0 = np.argmax(np.abs(self.model_estim.a_var[self.coef_loc_totest]), axis=0) + idx0 = np.argmax(np.abs(self.model_estim.model_container.theta_location[self.coef_loc_totest]), axis=0) idx1 = np.arange(len(idx0)) - # Leave the below for debugging right now, dask has different indexing than numpy does here: - assert not isinstance(self.model_estim.a_var, dask.array.core.Array), \ - "self.model_estim.a_var was dask array, aborting. Please file issue on github." 
+            if isinstance(self.model_estim.model_container.theta_location, dask.array.core.Array):
+                return self.model_estim.model_container.theta_location[self.coef_loc_totest, :].vindex[idx0.compute(), idx1].T
+            else:
+                # Use advanced numpy indexing here:
+                # https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#advanced-indexing
+                return self.model_estim.model_container.theta_location[self.coef_loc_totest, :][tuple(idx0), tuple(idx1)]
 
     def _ll(self):
         """
@@ -770,7 +786,7 @@
 
         :return: np.ndarray
         """
-        return self.model_estim.log_likelihood
+        return self.model_estim.model_container.ll_byfeature
 
     def _ave(self):
         """
@@ -778,7 +794,11 @@
 
         :return: np.ndarray
         """
-        return np.asarray(self.x.mean(axis=0)).flatten()
+        # https://github.com/dask/dask/issues/7169
+        x = self.x
+        if isinstance(x, dask.array.core.Array):
+            x = x.compute()
+        return np.asarray(x.mean(axis=0)).flatten()
 
     def _test(self):
         """
@@ -789,11 +809,11 @@
         # Check whether single- or multiple parameters are tested.
         # For a single parameter, the wald statistic distribution is approximated
         # with a normal distribution, for multiple parameters, a chi-square distribution is used.
-        self.theta_mle = self.model_estim.a_var[self.coef_loc_totest]
+        self.theta_mle = self.model_estim.model_container.theta_location[self.coef_loc_totest]
         if len(self.coef_loc_totest) == 1:
             self.theta_mle = self.theta_mle[0]
-            self.theta_sd = self.model_estim.fisher_inv[:, self.coef_loc_totest[0], self.coef_loc_totest[0]]
-            self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd, where=self.theta_sd < np.nextafter(0, np.inf))
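+            # Clamp non-positive variance estimates to the smallest positive
+            # float before taking the square root below.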
+            self.theta_sd = self.model_estim.model_container.fisher_inv[:, self.coef_loc_totest[0], self.coef_loc_totest[0]]
+            self.theta_sd = np.nextafter(0, np.inf, self.theta_sd, where=self.theta_sd < np.nextafter(0, np.inf))
             self.theta_sd = np.sqrt(self.theta_sd)
             return stats.wald_test(
                 theta_mle=self.theta_mle,
@@ -801,12 +821,12 @@
                 theta_sd=self.theta_sd,
                 theta0=0
             )
         else:
-            self.theta_sd = np.diagonal(self.model_estim.fisher_inv, axis1=-2, axis2=-1).copy()
+            self.theta_sd = np.diagonal(self.model_estim.model_container.fisher_inv, axis1=-2, axis2=-1).copy()
             self.theta_sd = np.nextafter(0, np.inf, out=self.theta_sd, where=self.theta_sd < np.nextafter(0, np.inf))
             self.theta_sd = np.sqrt(self.theta_sd)
             return stats.wald_test_chisq(
                 theta_mle=self.theta_mle,
-                theta_covar=self.model_estim.fisher_inv[:, self.coef_loc_totest, :][:, :, self.coef_loc_totest],
+                theta_covar=self.model_estim.model_container.fisher_inv[:, self.coef_loc_totest, :][:, :, self.coef_loc_totest],
                 theta0=0
             )
 
@@ -877,15 +897,15 @@ def plot_vs_ttest(
 
         plt.ioff()
 
-        grouping = np.asarray(self.model_estim.input_data.design_loc[:, self.coef_loc_totest])
+        grouping = np.asarray(self.model_estim.model_container.design_loc[:, self.coef_loc_totest])
         # Normalize by size factors that were used in regression.
-        if self.model_estim.input_data.size_factors is not None:
-            sf = np.broadcast_to(np.expand_dims(self.model_estim.input_data.size_factors, axis=1),
-                                 shape=self.model_estim.x.shape)
+        if self.model_estim.model_container.size_factors is not None:
+            sf = np.broadcast_to(np.expand_dims(self.model_estim.model_container.size_factors, axis=1),
+                                 shape=self.model_estim.model_container.x.shape)
         else:
-            sf = np.ones(shape=(self.model_estim.x.shape[0], 1))
+            sf = np.ones(shape=(self.model_estim.model_container.x.shape[0], 1))
         ttest = t_test(
-            data=self.model_estim.x / sf,
+            data=self.model_estim.model_container.x / sf,
             grouping=grouping,
             gene_names=self.gene_ids,
         )
@@ -951,27 +971,27 @@ def plot_comparison_ols_coef(
         import matplotlib.pyplot as plt
         from matplotlib import gridspec
         from matplotlib import rcParams
-        from batchglm.api.models.tf1.glm_norm import Estimator, InputDataGLM
+
         plt.ioff()
 
         # Run OLS model fit to have comparison coefficients.
         if self._store_ols is None:
             input_data_ols = InputDataGLM(
-                data=self.model_estim.input_data.data,
-                design_loc=self.model_estim.input_data.design_loc,
-                design_scale=self.model_estim.input_data.design_scale[:, [0]],
-                constraints_loc=self.model_estim.input_data.constraints_loc,
-                constraints_scale=self.model_estim.input_data.constraints_scale[[0], [0]],
-                size_factors=self.model_estim.input_data.size_factors,
-                feature_names=self.model_estim.input_data.features,
+                data=self.model_estim.model_container.data,
+                design_loc=self.model_estim.model_container.design_loc,
+                design_scale=self.model_estim.model_container.design_scale[:, [0]],
+                constraints_loc=self.model_estim.model_container.constraints_loc,
+                constraints_scale=self.model_estim.model_container.constraints_scale[[0], [0]],
+                size_factors=self.model_estim.model_container.size_factors,
+                feature_names=self.model_estim.model_container.features,
             )
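+            # The numpy backend builds estimators from a Model object wrapping
+            # the input data (glm_norm Model/Estimator are imported at module
+            # level above); the old tf1 dtype constructor argument is gone.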
+            model = Model(input_data=input_data_ols)
             estim_ols = Estimator(
-                input_data=input_data_ols,
+                model=model,
                 init_model=None,
                 init_a="standard",
                 init_b="standard",
-                dtype=self.model_estim.a_var.dtype
             )
             estim_ols.initialize()
             store_ols = estim_ols.finalize()
@@ -980,26 +1000,26 @@ def plot_comparison_ols_coef(
             store_ols = self._store_ols
 
         # Prepare parameter summary of both model fits.
-        par_loc = self.model_estim.input_data.data.coords["design_loc_params"].values
+        par_loc = self.model_estim.model_container.data.coords["design_loc_params"].values
 
-        a_var_ols = store_ols.a_var
-        a_var_ols[1:, :] = (a_var_ols[1:, :] + a_var_ols[[0], :]) / a_var_ols[[0], :]
+        theta_location_ols = store_ols.model_container.theta_location
+        theta_location_ols[1:, :] = (theta_location_ols[1:, :] + theta_location_ols[[0], :]) / theta_location_ols[[0], :]
 
-        a_var_user = self.model_estim.a_var
+        theta_location_user = self.model_estim.model_container.theta_location
         # Translate coefficients from both fits to be multiplicative in identity space.
         if self.noise_model == "nb":
-            a_var_user = np.exp(a_var_user)  # self.model_estim.inverse_link_loc(a_var_user)
+            theta_location_user = np.exp(theta_location_user)  # self.model_estim.inverse_link_loc(theta_location_user)
         elif self.noise_model == "norm":
-            a_var_user[1:, :] = (a_var_user[1:, :] + a_var_user[[0], :]) / a_var_user[[0], :]
+            theta_location_user[1:, :] = (theta_location_user[1:, :] + theta_location_user[[0], :]) / theta_location_user[[0], :]
         else:
             raise ValueError("noise model %s not yet supported for plot_comparison_ols" % self.noise_model)
 
         summaries_fits = [
             pd.DataFrame({
-                "user": a_var_user[i, :],
-                "ols": a_var_ols[i, :],
+                "user": theta_location_user[i, :],
+                "ols": theta_location_ols[i, :],
                 "coef": par_loc[i]
-            }) for i in range(self.model_estim.a_var.shape[0])
+            }) for i in range(self.model_estim.model_container.theta_location.shape[0])
         ]
 
         plt.ioff()
@@ -1090,27 +1110,26 @@ def plot_comparison_ols_pred(
         import matplotlib.pyplot as plt
         from matplotlib import gridspec
         from matplotlib import rcParams
-        from batchglm.api.models.tf1.glm_norm import Estimator, InputDataGLM
 
         plt.ioff()
 
         # Run OLS model fit to have comparison coefficients.
         if self._store_ols is None:
             input_data_ols = InputDataGLM(
-                data=self.model_estim.input_data.data,
-                design_loc=self.model_estim.input_data.design_loc,
-                design_scale=self.model_estim.input_data.design_scale[:, [0]],
-                constraints_loc=self.model_estim.input_data.constraints_loc,
-                constraints_scale=self.model_estim.input_data.constraints_scale[[0], [0]],
-                size_factors=self.model_estim.input_data.size_factors,
-                feature_names=self.model_estim.input_data.features,
+                data=self.model_estim.model_container.data,
+                design_loc=self.model_estim.model_container.design_loc,
+                design_scale=self.model_estim.model_container.design_scale[:, [0]],
+                constraints_loc=self.model_estim.model_container.constraints_loc,
+                constraints_scale=self.model_estim.model_container.constraints_scale[[0], [0]],
+                size_factors=self.model_estim.model_container.size_factors,
+                feature_names=self.model_estim.model_container.features,
             )
+            model = Model(input_data=input_data_ols)
             estim_ols = Estimator(
-                input_data=input_data_ols,
+                model=model,
                 init_model=None,
                 init_a="standard",
                 init_b="standard",
-                dtype=self.model_estim.a_var.dtype
             )
             estim_ols.initialize()
             store_ols = estim_ols.finalize()
@@ -1139,16 +1158,16 @@ def plot_comparison_ols_pred(
 
         pred_n_cells = sample(
             population=list(np.arange(0, self.model_estim.X.shape[0])),
-            k=np.min([20, self.model_estim.input_data.design_loc.shape[0]])
+            k=np.min([20, self.model_estim.model_container.design_loc.shape[0]])
         )
 
         x = np.asarray(self.model_estim.X[pred_n_cells, :]).flatten()
-        y_user = self.model_estim.model.inverse_link_loc(
-            np.matmul(self.model_estim.input_data.design_loc[pred_n_cells, :], self.model_estim.a_var).flatten()
+        y_user = self.model_estim.model_container.inverse_link_loc(
+            np.matmul(self.model_estim.model_container.design_loc[pred_n_cells, :], self.model_estim.model_container.theta_location).flatten()
         )
-        y_ols = store_ols.inverse_link_loc(
-            np.matmul(store_ols.design_loc[pred_n_cells, :], store_ols.a_var).flatten()
+        y_ols = store_ols.model_container.inverse_link_loc(
+            np.matmul(store_ols.model_container.design_loc[pred_n_cells, :], store_ols.model_container.theta_location).flatten()
        )
         if log1p_transform:
             x = np.log(x+1)
@@ -1247,10 +1266,10 @@ def _assemble_gene_fits(
 
         summaries_genes = []
         for i, g in enumerate(gene_names):
-            assert g in self.model_estim.input_data.features, "gene %g not found" % g
-            g_idx = self.model_estim.input_data.features.index(g)
+            assert g in self.model_estim.model_container.features, "gene %s not found" % g
+            g_idx = self.model_estim.model_container.features.index(g)
             # Raw data for boxplot:
-            y = self.model_estim.x[:, g_idx]
+            y = self.model_estim.model_container.x[:, g_idx]
             if isinstance(y, dask.array.core.Array):
                 y = y.compute()
             if isinstance(y, scipy.sparse.spmatrix) or isinstance(y, sparse.COO):
@@ -1268,6 +1287,10 @@
                     loc=loc,
                     scale=scale
                 )
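+            # Poisson has no separate scale parameter; simulated fits are
+            # drawn directly from the estimated rate.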
+            elif self.noise_model == "poisson":
+                yhat = np.random.poisson(
+                    lam=loc
+                )
             else:
                 raise ValueError("noise model %s not yet supported for plot_gene_fits" % self.noise_model)
 
@@ -1554,7 +1577,7 @@ def __init__(
         super().__init__()
         if isinstance(data, anndata.AnnData) or isinstance(data, anndata.Raw):
             data = data.X
-        elif isinstance(data, glm.typing.InputDataBase):
+        elif isinstance(data, glm.utils.input.InputDataGLM):
             data = data.x
         self._x = data
         self.sample_description = sample_description
@@ -1681,7 +1704,7 @@ def __init__(
         super().__init__()
         if isinstance(data, anndata.AnnData) or isinstance(data, anndata.Raw):
             data = data.X
-        elif isinstance(data, glm.typing.InputDataBase):
+        elif isinstance(data, glm.utils.input.InputDataGLM):
             data = data.x
         self._x = data
         self.sample_description = sample_description
@@ -1870,7 +1893,10 @@ def summary(self, **kwargs) -> pd.DataFrame:
         # next, get argmax of flattened logfc and unravel the true indices from it
         r, c = np.unravel_index(flat_logfc.argmax(0), raw_logfc.shape[:2])
         # if logfc is maximal in the lower triangular matrix, multiply it with -1
-        logfc = raw_logfc[r, c, np.arange(raw_logfc.shape[-1])] * np.where(r <= c, 1, -1)
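+        # As in log_fold_change above, dask needs the pointwise .vindex
+        # accessor for this triple-array selection; numpy keeps the plain
+        # advanced-indexing path.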
+        if isinstance(raw_logfc, dask.array.core.Array):
+            logfc = raw_logfc.vindex[r.compute(), c.compute(), np.arange(raw_logfc.shape[-1])] * np.where(r <= c, 1, -1)
+        else:
+            logfc = raw_logfc[r, c, np.arange(raw_logfc.shape[-1])] * np.where(r <= c, 1, -1)
 
         res = pd.DataFrame({
             "gene": self.gene_ids,
diff --git a/diffxpy/testing/det_cont.py b/diffxpy/testing/det_cont.py
index 55e3719..7addb3d 100644
--- a/diffxpy/testing/det_cont.py
+++ b/diffxpy/testing/det_cont.py
@@ -20,7 +20,7 @@ class _DifferentialExpressionTestCont(_DifferentialExpressionTestSingle):
 
     _de_test: _DifferentialExpressionTestSingle
-    _model_estim: glm.typing.EstimatorBaseTyping
+    _model_estim: glm.train.base.BaseEstimatorGlm
     _size_factors: np.ndarray
     _continuous_coords: np.ndarray
     _spline_coefs: list
 
@@ -28,7 +28,7 @@
     def __init__(
             self,
             de_test: _DifferentialExpressionTestSingle,
-            model_estim: glm.typing.EstimatorBaseTyping,
+            model_estim: glm.train.base.BaseEstimatorGlm,
             size_factors: np.ndarray,
             continuous_coords: np.ndarray,
             spline_coefs: list,
@@ -150,8 +150,8 @@ def _filter_genes_str(self, genes: list):
         :param genes: List of genes to filter.
         :return: Filtered list of genes
         """
-        genes_found = np.array([x in self.gene_ids for x in genes])
-        if any(np.logical_not(genes_found)):
+        genes_found = np.array([idx for idx, x in enumerate(genes) if x in self.gene_ids])
+        if len(genes_found) < len(genes):
             logger.info("did not find some genes, omitting")
             genes = genes[genes_found]
         return genes
@@ -163,8 +163,8 @@ def _filter_genes_int(self, genes: list):
         :param genes: List of genes to filter.
         :return: Filtered list of genes
         """
-        genes_found = np.array([x < self.x.shape[1] for x in genes])
-        if any(np.logical_not(genes_found)):
+        genes_found = np.array([idx for idx, x in enumerate(genes) if x < self.x.shape[1]])
+        if len(genes_found) < len(genes):
             logger.info("did not find some genes, omitting")
             genes = genes[genes_found]
         return genes
@@ -197,7 +197,7 @@ def _spline_par_loc_idx(self, intercept=True):
         :param intercept: Whether to include intercept.
         :return: Indices of spline basis parameters of location model.
         """
-        par_loc_names = self._model_estim.input_data.loc_names
+        par_loc_names = self._model_estim.model_container.loc_names
         idx = [par_loc_names.index(x) for x in self._spline_coefs]
         if 'Intercept' in par_loc_names and intercept:
             idx = np.concatenate([np.where([[x == 'Intercept' for x in par_loc_names]])[0], idx])
@@ -218,14 +218,14 @@ def _continuous_model(self, idx, non_numeric=False):
             idx = np.array([idx])
 
         if non_numeric:
-            mu = np.matmul(self._model_estim.input_data.design_loc,
-                           self._model_estim.model.a[:, idx])
+            mu = np.matmul(self._model_estim.model_container.design_loc,
+                           self._model_estim.model_container.theta_location_constrained[:, idx])
             if self._size_factors is not None:
-                mu = mu + self._model_estim.input_data.size_factors
+                mu = mu + self._model_estim.model_container.size_factors
         else:
             idx_basis = self._spline_par_loc_idx(intercept=True)
-            mu = np.matmul(self._model_estim.input_data.design_loc[:, idx_basis],
-                           self._model_estim.model.a[idx_basis, :][:, idx])
+            mu = np.matmul(self._model_estim.model_container.design_loc[:, idx_basis],
+                           self._model_estim.model_container.theta_location_constrained[idx_basis, :][:, idx])
 
         if isinstance(mu, dask.array.core.Array):
             mu = mu.compute()
@@ -246,7 +246,7 @@ def _continuous_interpolation(self, idx):
             idx = np.array([idx])
 
         idx_basis = self._spline_par_loc_idx(intercept=True)
-        a = self._model_estim.model.a[idx_basis, :]
+        a = self._model_estim.model_container.theta_location_constrained[idx_basis, :]
         if isinstance(a, dask.array.core.Array):
             a = a.compute()[:, idx]
         else:
@@ -393,8 +393,8 @@ def plot_genes(
                 y = y.compute()
             if isinstance(y, scipy.sparse.spmatrix) or isinstance(y, sparse.COO):
                 y = np.asarray(y.todense()).flatten()
-            if self._model_estim.input_data.size_factors is not None:
-                y = y / self._model_estim.input_data.size_factors
+            if self._model_estim.model_container.size_factors is not None:
+                y = y / self._model_estim.model_container.size_factors
             t_continuous, yhat = self._continuous_interpolation(idx=g)
             yhat = yhat.flatten()
             if scalings is not None:
@@ -402,7 +402,7 @@
                     [yhat],
                     [
                         yhat * np.expand_dims(
-                            np.exp(self._model_estim.a_var[self._model_estim.input_data.loc_names.index(x), g]),
+                            np.exp(self._model_estim.a_var[self._model_estim.model_container.loc_names.index(x), g]),
                             axis=0
                         )
                         for i, x in enumerate(scalings)
""" - model_estim: glm.typing.EstimatorBaseTyping + model_estim: glm.train.base.BaseEstimatorGlm theta_mle: np.ndarray theta_sd: np.ndarray def __init__( self, - model_estim: glm.typing.EstimatorBaseTyping, + model_estim: glm.train.base.BaseEstimatorGlm, grouping, groups, correction_type: str @@ -358,10 +358,10 @@ def __init__( self.groups = list(np.asarray(groups)) # Values of parameter estimates: coefficients x genes array with one coefficient per group - self._theta_mle = model_estim.a_var + self._theta_mle = model_estim.model_container.theta_location # Standard deviation of estimates: coefficients x genes array with one coefficient per group # Need .copy() here as nextafter needs mutabls copy. - theta_sd = np.diagonal(model_estim.fisher_inv, axis1=-2, axis2=-1).T.copy() + theta_sd = np.diagonal(model_estim.model_container.fisher_inv, axis1=-2, axis2=-1).T.copy() theta_sd = np.nextafter(0, np.inf, out=theta_sd, where=theta_sd < np.nextafter(0, np.inf)) self._theta_sd = np.sqrt(theta_sd) self._logfc = None @@ -371,7 +371,7 @@ def __init__( _ = self.qval def _test(self, **kwargs): - pvals = np.tile(np.NaN, [len(self.groups), len(self.groups), self.model_estim.x.shape[1]]) + pvals = np.tile(np.NaN, [len(self.groups), len(self.groups), self.model_estim.model_container.x.shape[1]]) for i, g1 in enumerate(self.groups): for j, g2 in enumerate(self.groups[(i + 1):]): j = j + i + 1 @@ -387,19 +387,21 @@ def _test(self, **kwargs): @property def gene_ids(self) -> np.ndarray: - return np.asarray(self.model_estim.input_data.features) + return np.asarray(self.model_estim.model_container.features) @property def x(self): - return self.model_estim.x + return self.model_estim.model_container.x @property def log_likelihood(self): - return self.model_estim.log_likelihood + return self.model_estim.model_container.ll_byfeature @property def model_gradient(self): - return self.model_estim.jacobian + return np.sum( + np.abs(self.model_estim.model_container.jac.compute() / self.model_estim.model_container.x.shape[0]), axis=1 + ) def _ave(self): """ @@ -408,7 +410,7 @@ def _ave(self): :return: np.ndarray """ - return np.asarray(np.mean(self.model_estim.x, axis=0)).flatten() + return np.asarray(np.mean(self.model_estim.model_container.x, axis=0)).flatten() def _pval_pairs(self, idx0, idx1): """ @@ -431,7 +433,7 @@ def _log_fold_change_pairs(self, idx0, idx1, base): :param base: Base of logarithm. :return: log fold change values """ - logfc = np.tile(np.NaN, [len(idx0), len(idx1), self.model_estim.x.shape[1]]) + logfc = np.tile(np.NaN, [len(idx0), len(idx1), self.model_estim.model_container.x.shape[1]]) for i, xi in enumerate(idx0): for j, xj in enumerate(idx1): logfc[i, j, :] = self._theta_mle[xj, :] - self._theta_mle[xi, :] @@ -526,13 +528,13 @@ class DifferentialExpressionTestZTestLazy(_DifferentialExpressionTestPairwiseLaz memory. 
""" - model_estim: glm.typing.EstimatorBaseTyping + model_estim: glm.train.base.BaseEstimatorGlm _theta_mle: np.ndarray _theta_sd: np.ndarray def __init__( self, - model_estim: glm.typing.EstimatorBaseTyping, + model_estim: glm.train.base.BaseEstimatorGlm, grouping, groups, correction_type="global" ): @@ -548,10 +550,10 @@ def __init__( self.groups = groups.tolist() # Values of parameter estimates: coefficients x genes array with one coefficient per group - self._theta_mle = model_estim.a_var + self._theta_mle = model_estim.model_container.theta_location # Standard deviation of estimates: coefficients x genes array with one coefficient per group # Need .copy() here as nextafter needs mutabls copy. - theta_sd = np.diagonal(model_estim.fisher_inv, axis1=-2, axis2=-1).T.copy() + theta_sd = np.diagonal(model_estim.model_container.fisher_inv, axis1=-2, axis2=-1).T.copy() theta_sd = np.nextafter(0, np.inf, out=theta_sd, where=theta_sd < np.nextafter(0, np.inf)) self._theta_sd = np.sqrt(theta_sd) @@ -563,10 +565,10 @@ def _test_pairs(self, idx0, idx1): :param idx1: List of indices of second set of group of observations in pair-wise comparison. :return: p-values """ - pvals = np.tile(np.NaN, [len(idx0), len(idx1), self.model_estim.x.shape[1]]) + pvals = np.tile(np.NaN, [len(idx0), len(idx1), self.model_estim.model_container.x.shape[1]]) for i, xi in enumerate(idx0): for j, xj in enumerate(idx1): - if i != j: + if xi != xj: pvals[i, j, :] = stats.two_coef_z_test( theta_mle0=self._theta_mle[xi, :], theta_mle1=self._theta_mle[xj, :], @@ -575,24 +577,25 @@ def _test_pairs(self, idx0, idx1): ) else: pvals[i, j, :] = np.array([1.]) - return pvals @property def gene_ids(self) -> np.ndarray: - return np.asarray(self.model_estim.input_data.features) + return np.asarray(self.model_estim.model_container.features) @property def x(self): - return self.model_estim.x + return self.model_estim.model_container.x @property def log_likelihood(self): - return self.model_estim.log_likelihood + return self.model_estim.model_container.ll_byfeature # should be by gene/feature? @property def model_gradient(self): - return self.model_estim.jacobian + return np.sum( + np.abs(self.model_estim.model_container.jac.compute() / self.model_estim.model_container.x.shape[0]), axis=1 + ) def _ave(self): """ @@ -600,7 +603,7 @@ def _ave(self): :return: np.ndarray """ - return np.asarray(np.mean(self.model_estim.x, axis=0)).flatten() + return np.asarray(np.mean(self.model_estim.model_container.x, axis=0)).flatten() def _log_fold_change_pairs(self, idx0, idx1, base): """ diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py index f649179..d18fe30 100644 --- a/diffxpy/testing/tests.py +++ b/diffxpy/testing/tests.py @@ -11,6 +11,7 @@ import patsy import scipy.sparse from typing import Union, List, Dict, Callable, Tuple +from batchglm.utils.input import InputDataGLM from diffxpy import pkg_constants from .det import DifferentialExpressionTestLRT, DifferentialExpressionTestWald, \ @@ -129,41 +130,22 @@ def _fit( :param close_session: If True, will finalize the estimator. Otherwise, return the estimator itself. """ # Load estimator for required noise model and backend: - if backend.lower() in ["tf1"]: - if noise_model == "nb" or noise_model == "negative_binomial": - from batchglm.api.models.tf1.glm_nb import Estimator, InputDataGLM - elif noise_model == "norm" or noise_model == "normal": - from batchglm.api.models.tf1.glm_norm import Estimator, InputDataGLM - else: - raise ValueError('noise_model="%s" not recognized.' 
+                if xi != xj:
                     pvals[i, j, :] = stats.two_coef_z_test(
                         theta_mle0=self._theta_mle[xi, :],
                         theta_mle1=self._theta_mle[xj, :],
@@ -575,24 +577,25 @@
                     )
                 else:
                     pvals[i, j, :] = np.array([1.])
-
         return pvals
 
     @property
     def gene_ids(self) -> np.ndarray:
-        return np.asarray(self.model_estim.input_data.features)
+        return np.asarray(self.model_estim.model_container.features)
 
     @property
     def x(self):
-        return self.model_estim.x
+        return self.model_estim.model_container.x
 
     @property
     def log_likelihood(self):
-        return self.model_estim.log_likelihood
+        return self.model_estim.model_container.ll_byfeature
 
     @property
     def model_gradient(self):
-        return self.model_estim.jacobian
+        return np.sum(
+            np.abs(self.model_estim.model_container.jac.compute() / self.model_estim.model_container.x.shape[0]),
+            axis=1
+        )
 
     def _ave(self):
         """
@@ -600,7 +603,7 @@
 
         :return: np.ndarray
         """
-        return np.asarray(np.mean(self.model_estim.x, axis=0)).flatten()
+        return np.asarray(np.mean(self.model_estim.model_container.x, axis=0)).flatten()
 
     def _log_fold_change_pairs(self, idx0, idx1, base):
         """
diff --git a/diffxpy/testing/tests.py b/diffxpy/testing/tests.py
index f649179..d18fe30 100644
--- a/diffxpy/testing/tests.py
+++ b/diffxpy/testing/tests.py
@@ -11,6 +11,7 @@
 import patsy
 import scipy.sparse
 from typing import Union, List, Dict, Callable, Tuple
+from batchglm.utils.input import InputDataGLM
 
 from diffxpy import pkg_constants
 from .det import DifferentialExpressionTestLRT, DifferentialExpressionTestWald, \
@@ -129,41 +130,22 @@ def _fit(
     :param close_session: If True, will finalize the estimator. Otherwise, return the estimator itself.
     """
     # Load estimator for required noise model and backend:
-    if backend.lower() in ["tf1"]:
-        if noise_model == "nb" or noise_model == "negative_binomial":
-            from batchglm.api.models.tf1.glm_nb import Estimator, InputDataGLM
-        elif noise_model == "norm" or noise_model == "normal":
-            from batchglm.api.models.tf1.glm_norm import Estimator, InputDataGLM
-        else:
-            raise ValueError('noise_model="%s" not recognized.' % noise_model)
-        if batch_size is None:
-            batch_size = 128
-        else:
-            if not isinstance(batch_size, int):
-                raise ValueError("batch_size has to be an integer if backend is tf1")
-        chunk_size_cells = int(1e9)
-        chunk_size_genes = 128
-    elif backend.lower() in ["tf2"]:
-        if noise_model == "nb" or noise_model == "negative_binomial":
-            from batchglm.api.models.tf2.glm_nb import Estimator, InputDataGLM
-        else:
-            raise ValueError('noise_model="%s" not recognized.' % noise_model)
-        if batch_size is None:
-            batch_size = 128
-        else:
-            if not isinstance(batch_size, int):
-                raise ValueError("batch_size has to be an integer if backend is tf2")
-        chunk_size_cells = int(1e9)
-        chunk_size_genes = 128
-    elif backend.lower() in ["numpy"]:
+    if backend.lower() in ["numpy"]:
         if isinstance(training_strategy, str):
             if training_strategy.lower() == "auto":
                 training_strategy = "DEFAULT"
         if noise_model == "nb" or noise_model == "negative_binomial":
-            from batchglm.api.models.numpy.glm_nb import Estimator, InputDataGLM
+            from batchglm.train.numpy.glm_nb import Estimator
+            from batchglm.models.glm_nb import Model
+        elif noise_model == "norm" or noise_model == "normal":
+            from batchglm.train.numpy.glm_norm import Estimator
+            from batchglm.models.glm_norm import Model
+        elif noise_model == "poisson":
+            from batchglm.train.numpy.glm_poisson import Estimator
+            from batchglm.models.glm_poisson import Model
         else:
             raise ValueError('noise_model="%s" not recognized.' % noise_model)
-        # Set default chunk size:
+        # Set default chunk size:
         if batch_size is None:
             chunk_size_cells = int(1e9)
             chunk_size_genes = 128
@@ -196,45 +178,23 @@
     constructor_args = {}
     if quick_scale is not None:
         constructor_args["quick_scale"] = quick_scale
-    # Backend-specific constructor arguments:
-    if backend.lower() in ["tf1"]:
-        constructor_args['provide_optimizers'] = {
-            "gd": pkg_constants.BATCHGLM_OPTIM_GD,
-            "adam": pkg_constants.BATCHGLM_OPTIM_ADAM,
-            "adagrad": pkg_constants.BATCHGLM_OPTIM_ADAGRAD,
-            "rmsprop": pkg_constants.BATCHGLM_OPTIM_RMSPROP,
-            "nr": pkg_constants.BATCHGLM_OPTIM_NEWTON,
-            "nr_tr": pkg_constants.BATCHGLM_OPTIM_NEWTON_TR,
-            "irls": pkg_constants.BATCHGLM_OPTIM_IRLS,
-            "irls_gd": pkg_constants.BATCHGLM_OPTIM_IRLS_GD,
-            "irls_tr": pkg_constants.BATCHGLM_OPTIM_IRLS_TR,
-            "irls_gd_tr": pkg_constants.BATCHGLM_OPTIM_IRLS_GD_TR
-        }
-        constructor_args['provide_batched'] = pkg_constants.BATCHGLM_PROVIDE_BATCHED
-        constructor_args['provide_fim'] = pkg_constants.BATCHGLM_PROVIDE_FIM
-        constructor_args['provide_hessian'] = pkg_constants.BATCHGLM_PROVIDE_HESSIAN
-        constructor_args["batch_size"] = batch_size
-    elif backend.lower() not in ["tf2"]:
-        pass
-    elif backend.lower() not in ["numpy"]:
-        pass
-    else:
+    if backend.lower() not in ["numpy", "tf2"]:
         raise ValueError('backend="%s" not recognized.' % backend)
-
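+    # The numpy backend estimator is now constructed around a Model object and
+    # the init_a/init_b arguments were renamed to init_location/init_scale
+    # upstream.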
+    model = Model(input_data=input_data)
     estim = Estimator(
-        input_data=input_data,
-        init_a=init_a,
-        init_b=init_b,
-        dtype=dtype,
-        **constructor_args
+        model=model,
+        init_location=init_a,
+        init_scale=init_b
     )
     estim.initialize()
 
     # Assemble backend specific key word arguments to training function:
     if batch_size is not None:
         train_args["batch_size"] = batch_size
-    if backend.lower() in ["tf1"]:
-        pass
-    elif backend.lower() in ["tf2"]:
+    if backend.lower() in ["tf2"]:
         train_args["autograd"] = pkg_constants.BATCHGLM_AUTOGRAD
         train_args["featurewise"] = pkg_constants.BATCHGLM_FEATUREWISE
@@ -252,7 +212,7 @@
 
 
 def lrt(
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         full_formula_loc: str,
         reduced_formula_loc: str,
         full_formula_scale: str = "~1",
@@ -371,28 +331,28 @@ def lrt(
         sample_description=sample_description
     )
 
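+    # design_matrix() now returns a tuple (the second element is unused here)
+    # and as_categorical expects the list of categorical column names rather
+    # than a boolean mask; the same change applies to all four calls below.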
-    full_design_loc = glm.data.design_matrix(
+    full_design_loc, _ = glm.utils.data.design_matrix(
         sample_description=sample_description,
         formula=full_formula_loc,
-        as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values],
+        as_categorical=[x for x in sample_description.columns.values if x not in as_numeric],
         return_type="patsy"
     )
-    reduced_design_loc = glm.data.design_matrix(
+    reduced_design_loc, _ = glm.utils.data.design_matrix(
         sample_description=sample_description,
         formula=reduced_formula_loc,
-        as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values],
+        as_categorical=[x for x in sample_description.columns.values if x not in as_numeric],
         return_type="patsy"
     )
-    full_design_scale = glm.data.design_matrix(
+    full_design_scale, _ = glm.utils.data.design_matrix(
         sample_description=sample_description,
         formula=full_formula_scale,
-        as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values],
+        as_categorical=[x for x in sample_description.columns.values if x not in as_numeric],
         return_type="patsy"
     )
-    reduced_design_scale = glm.data.design_matrix(
+    reduced_design_scale, _ = glm.utils.data.design_matrix(
         sample_description=sample_description,
         formula=reduced_formula_scale,
-        as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values],
+        as_categorical=[x for x in sample_description.columns.values if x not in as_numeric],
         return_type="patsy"
     )
 
@@ -423,8 +383,8 @@
         constraints_loc=None,
         constraints_scale=None,
         gene_names=gene_names,
-        init_a="init_model",
-        init_b="init_model",
+        init_a="auto",
+        init_b="auto",
         init_model=reduced_model,
         size_factors=size_factors,
         batch_size=batch_size,
@@ -448,7 +408,7 @@
 
 
 def wald(
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         factor_loc_totest: Union[str, List[str]] = None,
         coef_to_test: Union[str, List[str]] = None,
         formula_loc: Union[None, str] = None,
@@ -649,19 +609,19 @@ def wald(
 
     # Build design matrices and constraints.
     design_loc, design_loc_names, constraints_loc, term_names_loc = constraint_system_from_star(
+        constraints=constraints_loc,
         dmat=dmat_loc,
         sample_description=sample_description,
         formula=formula_loc,
         as_numeric=as_numeric,
-        constraints=constraints_loc,
         return_type="patsy"
     )
     design_scale, design_scale_names, constraints_scale, term_names_scale = constraint_system_from_star(
+        constraints=constraints_scale,
         dmat=dmat_scale,
         sample_description=sample_description,
         formula=formula_scale,
         as_numeric=as_numeric,
-        constraints=constraints_scale,
         return_type="patsy"
     )
 
@@ -672,7 +632,7 @@
     if not isinstance(design_loc, patsy.design_info.DesignMatrix):
         col_indices = np.where([
             x in factor_loc_totest
-            for x in term_names_loc
+            for x in design_loc_names  # match against the names of the matrix built above
         ])[0]
     else:
         # Select coefficients to test via formula model:
@@ -721,7 +681,7 @@
         col_indices = np.array([np.where(constraints_loc_temp[x, :] == 1)[0][0] for x in col_indices])
 
     # Fit model.
-    model = _fit(
+    estim = _fit(
         noise_model=noise_model,
         data=data,
         design_loc=design_loc,
@@ -745,7 +705,7 @@
 
     # Prepare differential expression test.
     de_test = DifferentialExpressionTestWald(
-        model_estim=model,
+        model_estim=estim,
         col_indices=col_indices,
         noise_model=noise_model,
         sample_description=sample_description
     )
@@ -783,7 +743,7 @@ def wald_repeated(
         coef_to_test = [coef_to_test]
 
     # Check that design_loc is patsy, otherwise use term_names for slicing.
-    par_loc_names = det.model_estim.model.design_loc_names
+    par_loc_names = det.model_estim.model_container.design_loc_names
     if factor_loc_totest is not None and coef_to_test is None:
         col_indices = np.concatenate([np.where([
             fac in x
@@ -806,7 +766,7 @@
     assert len(col_indices) > 0, "Could not find any matching columns!"
 
     # Check that all tested coefficients are independent:
-    constraints_loc = det.model_estim.model.constraints_loc
+    constraints_loc = det.model_estim.model_container.constraints_loc
     if isinstance(constraints_loc, dask.array.core.Array):
         constraints_loc = constraints_loc.compute()
     for x in col_indices:
@@ -829,7 +789,7 @@
 
 
 def t_test(
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         grouping,
         gene_names: Union[np.ndarray, list] = None,
         sample_description: pd.DataFrame = None,
@@ -871,7 +831,7 @@
 
 
 def rank_test(
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         grouping: Union[str, np.ndarray, list],
         gene_names: Union[np.ndarray, list] = None,
         sample_description: pd.DataFrame = None,
@@ -913,7 +873,7 @@
 
 
 def two_sample(
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         grouping: Union[str, np.ndarray, list],
         as_numeric: Union[List[str], Tuple[str], str] = (),
         test: str = "t-test",
@@ -1102,7 +1062,7 @@
 
 
 def pairwise(
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         grouping: Union[str, np.ndarray, list],
         as_numeric: Union[List[str], Tuple[str], str] = (),
         test: str = "z-test",
@@ -1238,19 +1198,24 @@ def pairwise(
 
     if test.lower() == 'z-test' or test.lower() == 'z_test' or test.lower() == 'ztest':
         # -1 in formula removes intercept
-        dmat = glm.data.design_matrix(
+        dmat_loc, _ = glm.utils.data.design_matrix(
             sample_description,
             formula="~ 1 - 1 + grouping"
         )
+        # Only intercept scale model
+        dmat_scale, _ = glm.utils.data.design_matrix(
+            sample_description,
+            formula="~ 1"
+        )
         model = _fit(
             noise_model=noise_model,
             data=data,
-            design_loc=dmat,
-            design_scale=dmat,
+            design_loc=dmat_loc,
+            design_scale=dmat_scale,
             gene_names=gene_names,
             size_factors=size_factors,
             init_a="closed_form",
-            init_b="closed_form",
+            init_b="auto",
             batch_size=batch_size,
             backend=backend,
             train_args=train_args,
@@ -1277,7 +1242,7 @@
     else:
         if isinstance(data, anndata.AnnData) or isinstance(data, anndata.Raw):
             data = data.X
-        elif isinstance(data, glm.typing.InputDataBase):
+        elif isinstance(data, glm.utils.input.InputDataGLM):
             data = data.x
         groups = np.unique(grouping)
         pvals = np.tile(np.NaN, [len(groups), len(groups), data.shape[1]])
@@ -1289,6 +1254,10 @@
             tests = np.tile([None], [len(groups), len(groups)])
         else:
             tests = None
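+
+        # Only the model-based tests accept a noise model; forwarding it once
+        # via kwargs keeps the t-test and rank-test signatures unchanged.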
+        if test not in ["rank", "t-test"]:
+            kwargs["noise_model"] = noise_model
+
 
         for i, g1 in enumerate(groups):
             for j, g2 in enumerate(groups[(i + 1):]):
@@ -1305,7 +1274,6 @@
                     test=test,
                     gene_names=gene_names,
                     sample_description=sample_description.iloc[idx, :],
-                    noise_model=noise_model,
                     size_factors=size_factors[idx] if size_factors is not None else None,
                     batch_size=batch_size,
                     backend=backend,
@@ -1338,7 +1306,7 @@
 
 
 def versus_rest(
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         grouping: Union[str, np.ndarray, list],
         as_numeric: Union[List[str], Tuple[str], str] = (),
         test: str = 'wald',
@@ -1514,7 +1482,7 @@
 
 
 def partition(
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         parts: Union[str, np.ndarray, list],
         gene_names: Union[np.ndarray, list] = None,
         sample_description: pd.DataFrame = None
@@ -1557,7 +1525,7 @@ class _Partition:
 
     def __init__(
             self,
-            data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+            data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
             parts: Union[str, np.ndarray, list],
             gene_names: Union[np.ndarray, list] = None,
             sample_description: pd.DataFrame = None
     ):
@@ -1572,12 +1540,14 @@
         :param gene_names: optional list/array of gene names which will be used if `data` does not implicitly store these
         :param sample_description: optional pandas.DataFrame containing sample annotations
         """
-        if isinstance(data, glm.typing.InputDataBase):
+        if isinstance(data, glm.utils.input.InputDataGLM):
             self.x = data.x
         elif isinstance(data, anndata.AnnData) or isinstance(data, Raw):
             self.x = data.X
         elif isinstance(data, np.ndarray):
             self.x = data
+        elif isinstance(data, dask.array.core.Array):
+            self.x = data.compute()
         else:
             raise ValueError("data type %s not recognized" % type(data))
         self.gene_names = parse_gene_names(data, gene_names)
@@ -1704,7 +1674,6 @@ def t_test(
                 gene_names=self.gene_names,
                 sample_description=self.sample_description.iloc[idx, :],
                 is_sig_zerovar=is_sig_zerovar,
-                dtype=dtype
             ))
         return DifferentialExpressionTestByPartition(
             partitions=self.partitions,
@@ -1740,7 +1709,6 @@ def rank_test(
                 gene_names=self.gene_names,
                 sample_description=self.sample_description.iloc[idx, :],
                 is_sig_zerovar=is_sig_zerovar,
-                dtype=dtype
             ))
         return DifferentialExpressionTestByPartition(
             partitions=self.partitions,
@@ -1978,7 +1946,7 @@ def wald(
 
 
 def continuous_1d(
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         continuous: str,
         factor_loc_totest: Union[str, List[str]],
         formula_loc: str,
@@ -2184,7 +2152,7 @@ def continuous_1d(
         as_numeric = list(as_numeric)
 
     gene_names = parse_gene_names(data, gene_names)
-    sample_description = parse_sample_description(data, sample_description)
+    sample_description = parse_sample_description(data, sample_description).copy()  # need copy to reset values.
diff --git a/diffxpy/testing/utils.py b/diffxpy/testing/utils.py
index 369d399..83bbb2a 100644
--- a/diffxpy/testing/utils.py
+++ b/diffxpy/testing/utils.py
@@ -4,6 +4,7 @@
 except ImportError:
     from anndata import Raw
 import batchglm.api as glm
+import dask
 import numpy as np
 import pandas as pd
 import patsy
@@ -13,18 +14,19 @@
 
 # Relay util functions for diffxpy api.
 # design_matrix, preview_coef_names and constraint_system_from_star are redefined here.
-from batchglm.data import constraint_matrix_from_string, constraint_matrix_from_dict
-from batchglm.data import view_coef_names
+from batchglm.utils.data import constraint_matrix_from_string, constraint_system_from_dict
+from batchglm.utils.data import view_coef_names
 
 
 def parse_gene_names(
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         gene_names: Union[list, np.ndarray, None]
 ):
     if gene_names is None:
         if anndata is not None and (isinstance(data, anndata.AnnData) or isinstance(data, Raw)):
             gene_names = data.var_names
-        elif isinstance(data, glm.typing.InputDataBase):
+        elif isinstance(data, glm.utils.input.InputDataGLM):
             gene_names = data.features
         else:
             raise ValueError("Missing gene names")
@@ -33,7 +35,7 @@
 
 
 def parse_sample_description(
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         sample_description: Union[pd.DataFrame, None]
 ) -> pd.DataFrame:
     """
@@ -58,7 +60,7 @@
             assert data.X.shape[0] == sample_description.shape[0], \
                 "data matrix and sample description must contain same number of cells: %i, %i" % \
                 (data.X.shape[0], sample_description.shape[0])
-        elif isinstance(data, glm.typing.InputDataBase):
+        elif isinstance(data, glm.utils.input.InputDataGLM):
             assert data.x.shape[0] == sample_description.shape[0], \
                 "data matrix and sample description must contain same number of cells: %i, %i" % \
                 (data.x.shape[0], sample_description.shape[0])
@@ -71,7 +73,7 @@
 
 def parse_size_factors(
         size_factors: Union[np.ndarray, pd.core.series.Series, np.ndarray],
-        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.typing.InputDataBase],
+        data: Union[anndata.AnnData, Raw, np.ndarray, scipy.sparse.csr_matrix, glm.utils.input.InputDataGLM],
         sample_description: pd.DataFrame
 ) -> Union[np.ndarray, None]:
     """
@@ -93,7 +95,7 @@
         if anndata is not None and isinstance(data, Raw):
             data_shape = data.X.shape
-        elif isinstance(data, glm.typing.InputDataBase):
+        elif isinstance(data, glm.utils.input.InputDataGLM):
             data_shape = data.x.shape
         else:
             data_shape = data.shape
@@ -118,6 +120,8 @@
 
 
 def dmat_unique(dmat, sample_description):
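+    # np.unique with axis=0 does not operate lazily on dask arrays, so the
+    # design matrix is materialized before deduplication.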
+    if isinstance(dmat, dask.array.core.Array):
+        dmat = dmat.compute()
     dmat, idx = np.unique(dmat, axis=0, return_index=True)
     sample_description = sample_description.iloc[idx].reset_index(drop=True)
 
@@ -135,7 +139,7 @@ def design_matrix(
     """
     Create a design matrix from some sample description.
     This function defaults to perform formatting if dmat is directly supplied as a pd.DataFrame.
-    This function relays batchglm.data.design_matrix() to behave like the other wrappers in diffxpy.
+    This function relays batchglm.utils.data.design_matrix() to behave like the other wrappers in diffxpy.
 
     :param data: Input data matrix (observations x features) or (cells x genes).
     :param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
@@ -164,11 +168,11 @@
     sample_description = parse_sample_description(data, sample_description)
 
     if sample_description is not None:
-        as_categorical = [False if x in as_numeric else True for x in sample_description.columns.values]
+        as_categorical = [x for x in sample_description.columns.values if x not in as_numeric]
     else:
         as_categorical = True
 
-    return glm.data.design_matrix(
+    return glm.utils.data.design_matrix(
         sample_description=sample_description,
         formula=formula,
         as_categorical=as_categorical,
@@ -186,7 +190,7 @@ def preview_coef_names(
     Return coefficient names of model.
 
     Use this to preview what the model would look like.
-    This function relays batchglm.data.preview_coef_names() to behave like the other wrappers in diffxpy.
+    This function relays batchglm.utils.data.preview_coef_names() to behave like the other wrappers in diffxpy.
 
     :param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
     :param formula: model formula as string, describing the relations of the explanatory variables.
@@ -205,25 +209,25 @@
     if isinstance(as_numeric, tuple):
         as_numeric = list(as_numeric)
 
-    return glm.data.preview_coef_names(
+    return glm.utils.data.preview_coef_names(
         sample_description=sample_description,
         formula=formula,
-        as_categorical=[False if x in as_numeric else True for x in sample_description.columns.values]
+        as_categorical=[x for x in sample_description.columns.values if x not in as_numeric]
     )
 
 
 def constraint_system_from_star(
+        constraints: dict = {},
         dmat: Union[None, patsy.design_info.DesignMatrix] = None,
         sample_description: Union[None, pd.DataFrame] = None,
         formula: Union[None, str] = None,
         as_numeric: Union[List[str], Tuple[str], str] = (),
-        constraints: dict = {},
         return_type: str = "patsy",
 ) -> Tuple:
     """
     Create a design matrix and a constraint matrix.
-    This function relays batchglm.data.constraint_matrix_from_star() to behave like the other wrappers in diffxpy.
+    This function relays batchglm.utils.data.constraint_matrix_from_star() to behave like the other wrappers in diffxpy.
 
     :param dmat: Pre-built model design matrix.
     :param sample_description: pandas.DataFrame of length "num_observations" containing explanatory variables as columns
@@ -259,16 +263,16 @@
         as_numeric = list(as_numeric)
 
     if sample_description is not None:
-        as_categorical = [False if x in as_numeric else True for x in sample_description.columns.values]
+        as_categorical = [x for x in sample_description.columns.values if x not in as_numeric]
     else:
         as_categorical = True
 
-    return glm.data.constraint_system_from_star(
+    return glm.utils.data.constraint_system_from_star(
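+        # 'constraints' moved to the front of the upstream signature and is
+        # therefore passed positionally here.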
+        constraints,
         dmat=dmat,
         sample_description=sample_description,
         formula=formula,
         as_categorical=as_categorical,
-        constraints=constraints,
         return_type=return_type
     )
@@ -305,7 +309,7 @@ def bin_continuous_covariate(
     else:
         bins = np.arange(0, 1, 1 / bins)
 
-    fac_binned = glm.data.bin_continuous_covariate(
+    fac_binned = glm.utils.data.bin_continuous_covariate(
         sample_description=sd,
         factor_to_bin=factor_to_bin,
         bins=bins
diff --git a/diffxpy/unit_test/test_acc_glm_all_numpy_temp.py b/diffxpy/unit_test/test_acc_glm_all_numpy_temp.py
index cca96a0..02ff1de 100644
--- a/diffxpy/unit_test/test_acc_glm_all_numpy_temp.py
+++ b/diffxpy/unit_test/test_acc_glm_all_numpy_temp.py
@@ -1,9 +1,7 @@
 import logging
-import anndata
 import numpy as np
-import scipy.sparse
 import unittest
-
+import scanpy as sc
 import batchglm.api as glm
 import diffxpy.api as de
 
@@ -11,36 +9,40 @@
 logger = logging.getLogger(__name__)
 
 
-class TestAccuracyGlmNb(
+class TestConvergence(
     unittest.TestCase
 ):
     """
-    Test whether optimizers yield exact results for negative binomial distributed data.
+    Test whether optimizers converge for each supported noise model.
     """
 
-    def test_full_nb(self):
-        logging.getLogger("batchglm").setLevel(logging.INFO)
-        logger.error("TestAccuracyGlmNb.test_full_nb()")
-
+    def _test_full_model(self, noise_model):
         np.random.seed(1)
-        adata = anndata.read_h5ad("/Users/david.fischer/Desktop/test.h5ad")
-        TF = "Ascl1"
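+        # Use the public scanpy pbmc3k dataset instead of a local .h5ad so the
+        # test runs on any machine; MALAT1 serves as a highly expressed
+        # example regressor.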
""" if noise_model == "nb": - from batchglm.api.models.numpy.glm_nb import Simulator + model = NBModel() rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": - from batchglm.api.models.numpy.glm_norm import Simulator + model = NormModel() rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) - else: - raise ValueError("noise model %s not recognized" % noise_model) + elif noise_model == "poisson": + model = PoissonModel() + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate_params(rand_fn_scale=rand_fn_scale) - sim.generate_data() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + rand_fn_scale=rand_fn_scale + ) random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.nobs), - "batch": np.random.randint(2, size=sim.nobs) + "condition": np.random.randint(2, size=n_cells), + "batch": np.random.randint(2, size=n_cells) }) test = de.test.wald( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=random_sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition + batch", @@ -64,59 +73,13 @@ def _test_null_distribution_wald( return True -class TestSingleNullBackendsNb(_TestSingleNullBackends, unittest.TestCase): +class TestSingleNullBackends(_TestSingleNullBackends, unittest.TestCase): """ Negative binomial noise model unit tests that test whether a test generates uniformly distributed p-values if data are sampled from the null model. """ - def test_null_distribution_wald_nb_tf1( - self, - n_cells: int = 2000, - n_genes: int = 200 - ): - """ - Test if wald() generates a uniform p-value distribution for "nb" noise model under tf1 backend - - :param n_cells: Number of cells to simulate (number of observations per test). - :param n_genes: Number of genes to simulate (number of tests). - """ - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.WARNING) - logging.getLogger("diffxpy").setLevel(logging.WARNING) - - np.random.seed(1) - _ = self._test_null_distribution_wald( - n_cells=n_cells, - n_genes=n_genes, - noise_model="nb", - backend="tf1" - ) - - def test_null_distribution_wald_nb_tf2( - self, - n_cells: int = 2000, - n_genes: int = 200 - ): - """ - Test if wald() generates a uniform p-value distribution for "nb" noise model under tf2 backend - - :param n_cells: Number of cells to simulate (number of observations per test). - :param n_genes: Number of genes to simulate (number of tests). 
- """ - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.WARNING) - logging.getLogger("diffxpy").setLevel(logging.WARNING) - - np.random.seed(1) - _ = self._test_null_distribution_wald( - n_cells=n_cells, - n_genes=n_genes, - noise_model="nb", - backend="tf2" - ) - - def test_null_distribution_wald_nb_numpy( + def test_null_distribution_wald_numpy( self, n_cells: int = 2000, n_genes: int = 200 @@ -132,12 +95,13 @@ def test_null_distribution_wald_nb_numpy( logging.getLogger("diffxpy").setLevel(logging.WARNING) np.random.seed(1) - _ = self._test_null_distribution_wald( - n_cells=n_cells, - n_genes=n_genes, - noise_model="nb", - backend="numpy" - ) + for noise_model in ['poisson', 'nb', 'norm']: + _ = self._test_null_distribution_wald( + n_cells=n_cells, + n_genes=n_genes, + noise_model="nb", + backend="numpy" + ) if __name__ == '__main__': diff --git a/diffxpy/unit_test/test_constrained.py b/diffxpy/unit_test/test_constrained.py index 4144066..1ad6af0 100644 --- a/diffxpy/unit_test/test_constrained.py +++ b/diffxpy/unit_test/test_constrained.py @@ -5,7 +5,7 @@ import pandas as pd import scipy.stats as stats -from batchglm.api.models.numpy.glm_nb import Simulator +from batchglm.models.glm_nb import Model as NBModel import diffxpy.api as de @@ -25,9 +25,13 @@ def test_forfatal_from_string(self): n_cells = 2000 n_genes = 2 - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate() + model = NBModel() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + ) # Build design matrix: dmat = np.zeros([n_cells, 6]) @@ -41,7 +45,7 @@ def test_forfatal_from_string(self): dmat_est = pd.DataFrame(data=dmat, columns=coefficient_names) dmat_est_loc, _ = de.utils.design_matrix(dmat=dmat_est, return_type="dataframe") - dmat_est_scale, _ = de.utils.design_matrix(dmat=dmat_est, return_type="dataframe") + dmat_est_scale, _ = de.utils.design_matrix(dmat=pd.DataFrame(dmat_est['intercept']), return_type="dataframe") # Build constraints: constraints_loc = de.utils.constraint_matrix_from_string( @@ -49,14 +53,11 @@ def test_forfatal_from_string(self): coef_names=dmat_est_loc.columns, constraints=["bio1+bio2=0", "bio3+bio4=0"] ) - constraints_scale = de.utils.constraint_matrix_from_string( - dmat=dmat_est_scale.values, - coef_names=dmat_est_scale.columns, - constraints=["bio1+bio2=0", "bio3+bio4=0"] - ) + constraints_scale = None test = de.test.wald( - data=sim.input_data, + data=model.x, + gene_names=model.features, dmat_loc=dmat_est_loc, dmat_scale=dmat_est_scale, constraints_loc=constraints_loc, @@ -77,9 +78,13 @@ def test_forfatal_from_dict(self): n_cells = 2000 n_genes = 2 - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate() + model = NBModel() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + ) # Build design matrix: sample_description = pd.DataFrame({ @@ -88,12 +93,13 @@ def test_forfatal_from_dict(self): }) test = de.test.wald( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=sample_description, formula_loc="~1+cond+batch", - formula_scale="~1+cond+batch", + formula_scale="~1", constraints_loc={"batch": "cond"}, - constraints_scale={"batch": "cond"}, + constraints_scale=None, coef_to_test=["cond[T.cond1]"] ) _ = 
test.summary() @@ -115,9 +121,13 @@ def test_null_distribution_wald_constrained(self, n_genes: int = 100): np.random.seed(1) n_cells = 2000 - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate() + model = NBModel() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + ) # Build design matrix: sample_description = pd.DataFrame({ @@ -126,12 +136,13 @@ def test_null_distribution_wald_constrained(self, n_genes: int = 100): }) test = de.test.wald( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=sample_description, formula_loc="~1+cond+batch", - formula_scale="~1+cond+batch", + formula_scale="~1", constraints_loc={"batch": "cond"}, - constraints_scale={"batch": "cond"}, + constraints_scale=None, coef_to_test=["cond[T.cond1]"] ) _ = test.summary() @@ -161,9 +172,13 @@ def _test_null_distribution_wald_constrained_2layer(self, n_genes: int = 100): np.random.seed(1) n_cells = 12000 - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate() + model = NBModel() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + ) # Build design matrix: dmat = np.zeros([n_cells, 14]) @@ -191,8 +206,8 @@ def _test_null_distribution_wald_constrained_2layer(self, n_genes: int = 100): 'tech1', 'tech2', 'tech3', 'tech4'] dmat_est = pd.DataFrame(data=dmat, columns=coefficient_names) - dmat_est_loc = de.utils.design_matrix(dmat=dmat_est, return_type="dataframe") - dmat_est_scale = de.utils.design_matrix(dmat=dmat_est.iloc[:, [0]], return_type="dataframe") + dmat_est_loc, _ = de.utils.design_matrix(dmat=dmat_est, return_type="dataframe") + dmat_est_scale, _ = de.utils.design_matrix(dmat=dmat_est.iloc[:, [0]], return_type="dataframe") # Build constraints: constraints_loc = de.utils.constraint_matrix_from_string( @@ -208,7 +223,8 @@ def _test_null_distribution_wald_constrained_2layer(self, n_genes: int = 100): constraints_scale = None test = de.test.wald( - data=sim.input_data, + data=model.x, + gene_names=model.features, dmat_loc=dmat_est_loc, dmat_scale=dmat_est_scale, constraints_loc=constraints_loc, diff --git a/diffxpy/unit_test/test_continuous_de.py b/diffxpy/unit_test/test_continuous_de.py index bca5e65..66c7d4c 100644 --- a/diffxpy/unit_test/test_continuous_de.py +++ b/diffxpy/unit_test/test_continuous_de.py @@ -4,6 +4,9 @@ import diffxpy.api as de +from batchglm.models.glm_nb import Model as NBModel +from batchglm.models.glm_norm import Model as NormModel +from batchglm.models.glm_poisson import Model as PoissonModel class _TestContinuousDe: noise_model: str @@ -15,33 +18,42 @@ def _test_wald_de( ngenes: int ): if self.noise_model == "nb": - from batchglm.api.models.numpy.glm_nb import Simulator + model = NBModel() rand_fn_loc = lambda shape: np.random.uniform(2, 5, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif self.noise_model == "norm": - from batchglm.api.models.numpy.glm_norm import Simulator + model = NormModel() rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + elif self.noise_model == "poisson": + model = PoissonModel() + rand_fn_loc = lambda shape: np.random.uniform(2, 10, shape) + rand_fn_scale = None else: raise ValueError("noise model %s not recognized" % self.noise_model) 
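The same migration recurs throughout this diff: the removed batchglm Simulator
workflow collapses into a single generate_artificial_data() call on a model
instance. A minimal standalone sketch of that pattern (keyword names taken from
the calls in this diff; batchglm's exact signature may differ):

    import numpy as np
    from batchglm.models.glm_nb import Model as NBModel

    model = NBModel()
    model.generate_artificial_data(
        n_obs=2000,          # was Simulator(num_observations=...)
        n_vars=100,          # was Simulator(num_features=...)
        num_batches=0,
        num_conditions=2,
        rand_fn_scale=lambda shape: np.random.uniform(1, 2, shape),
    )
    x = model.x                # simulated counts; replaces sim.x / sim.input_data
    features = model.features  # gene names, now passed explicitly to de.test.*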
        n_timepoints = 7
-        sim = Simulator(num_observations=n_timepoints*200, num_features=ngenes)
-        sim.generate_sample_description(
+
+        num_non_de = round(ngenes / 2)
+        def theta_location_setter(x):
+            x[1:, :num_non_de] = 0
+            return x
+        def theta_scale_setter(x):
+            x[1:, :] = 0
+            return x
+        model.generate_artificial_data(
+            n_obs=n_timepoints*200,
+            n_vars=ngenes,
             num_batches=0,
-            num_conditions=n_timepoints
-        )
-        sim.generate_params(
+            num_conditions=n_timepoints,
             rand_fn_loc=rand_fn_loc,
-            rand_fn_scale=rand_fn_scale
+            rand_fn_scale=rand_fn_scale,
+            theta_location_setter=theta_location_setter,
+            theta_scale_setter=theta_scale_setter
         )
-        num_non_de = round(ngenes / 2)
-        sim.a_var[1:, :num_non_de] = 0  # Set all condition effects of non DE genes to zero.
-        sim.b_var[1:, :] = 0  # Use constant dispersion across all conditions.
         self.isDE = np.arange(ngenes) >= num_non_de
-        sim.generate_data()
 
-        random_sample_description = sim.sample_description
+        random_sample_description = model.sample_description
         random_sample_description["continuous"] = [int(x) for x in random_sample_description["condition"]]
         random_sample_description["batch"] = [
             str(int(x)) + str(np.random.randint(0, 3))
@@ -49,9 +61,9 @@ def _test_wald_de(
         ]
 
         test = de.test.continuous_1d(
-            data=sim.input_data,
+            data=model.x,
+            gene_names=model.features,
             sample_description=random_sample_description,
-            gene_names=["gene" + str(i) for i in range(sim.input_data.num_features)],
             formula_loc="~ 1 + continuous + batch" if constrained else "~ 1 + continuous",
             formula_scale="~ 1",
             factor_loc_totest="continuous",
@@ -63,9 +75,9 @@ def _test_wald_de(
             quick_scale=True,
             noise_model=self.noise_model
         )
-        self._eval(sim=sim, test=test)
+        self._eval(model=model, test=test)
 
-    def _eval(self, sim, test):
+    def _eval(self, model, test):
         idx_de = np.where(self.isDE)[0]
         idx_nonde = np.where(np.logical_not(self.isDE))[0]
 
@@ -83,7 +95,7 @@ def _eval(self, sim, test):
         assert frac_de_of_non_de <= 0.1, "too many false-positives, FPR=%f" % frac_de_of_non_de
         assert frac_de_of_de >= 0.5, "too many false-negatives, TPR=%f" % frac_de_of_de
 
-        return sim
+        return model
 
     def _test_wald_de_all_splines(
         self,
@@ -135,6 +147,26 @@ def test_wald_de_norm(self):
         self._test_wald_de_all_splines(ngenes=100, constrained=True)
         return True
 
+class TestContinuousDePoisson(_TestContinuousDe, unittest.TestCase):
+    """
+    Poisson noise model unit tests that test false positive and false negative rates.
+ """ + + def test_wald_de_poisson(self): + """ + + :return: + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + self.noise_model = "poisson" + np.random.seed(1) + self._test_wald_de_all_splines(ngenes=100, constrained=False) + self._test_wald_de_all_splines(ngenes=100, constrained=True) + return True + if __name__ == '__main__': unittest.main() diff --git a/diffxpy/unit_test/test_continuous_null.py b/diffxpy/unit_test/test_continuous_null.py index 487fc7d..52aacb6 100644 --- a/diffxpy/unit_test/test_continuous_null.py +++ b/diffxpy/unit_test/test_continuous_null.py @@ -5,7 +5,7 @@ import scipy.stats as stats import logging -from batchglm.api.models.numpy.glm_nb import Simulator +from batchglm.models.glm_nb import Model as NBModel import diffxpy.api as de @@ -15,16 +15,16 @@ class _TestContinuous: def _fit_continuous( self, - sim, + model, sample_description, constrained, test, spline_basis ): test = de.test.continuous_1d( - data=sim.input_data, + data=model.x, sample_description=sample_description, - gene_names=["gene" + str(i) for i in range(sim.input_data.num_features)], + gene_names=model.features, formula_loc="~ 1 + continuous + batch" if constrained else "~ 1 + continuous", formula_scale="~ 1", factor_loc_totest="continuous", @@ -41,16 +41,16 @@ def _fit_continuous( def _fit_continuous_interaction( self, - sim, + model, sample_description, constrained, test, spline_basis ): test = de.test.continuous_1d( - data=sim.input_data, + data=model.x, sample_description=sample_description, - gene_names=["gene" + str(i) for i in range(sim.input_data.num_features)], + gene_names=model.features, formula_loc="~ 1 + continuous + condition + continuous:condition" if not constrained else \ "~ 1 + continuous + condition + continuous:condition + batch", formula_scale="~ 1", @@ -74,19 +74,23 @@ def _test_basic( spline_basis: str ): n_timepoints = 5 - sim = Simulator(num_observations=n_timepoints*200, num_features=ngenes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate_params() - sim.generate_data() + model = NBModel() + nobs = n_timepoints*200 + model.generate_artificial_data( + n_obs=nobs, + n_vars=ngenes, + num_batches=0, + num_conditions=0, + ) random_sample_description = pd.DataFrame({ - "continuous": np.asarray(np.random.randint(0, n_timepoints, size=sim.nobs), dtype=float) + "continuous": np.asarray(np.random.randint(0, n_timepoints, size=nobs), dtype=float) }) random_sample_description["batch"] = [str(int(x)) + str(np.random.randint(0, 3)) for x in random_sample_description["continuous"]] - random_sample_description["size_factors"] = np.random.uniform(0.9, 1.1, sim.nobs) # TODO put into simulation. + random_sample_description["size_factors"] = np.random.uniform(0.9, 1.1, nobs) # TODO put into simulation. 
det = self._fit_continuous( - sim=sim, + model=model, sample_description=random_sample_description, test=test, constrained=constrained, @@ -102,21 +106,25 @@ def _test_interaction( spline_basis: str ): n_timepoints = 5 - sim = Simulator(num_observations=n_timepoints*200, num_features=ngenes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate_params() - sim.generate_data() + model = NBModel() + nobs = n_timepoints * 200 + model.generate_artificial_data( + n_obs=nobs, + n_vars=ngenes, + num_batches=0, + num_conditions=0, + ) random_sample_description = pd.DataFrame({ - "continuous": np.asarray(np.random.randint(0, n_timepoints, size=sim.nobs), dtype=float) + "continuous": np.asarray(np.random.randint(0, n_timepoints, size=nobs), dtype=float) }) random_sample_description["condition"] = [str(np.random.randint(0, 2)) for x in random_sample_description["continuous"]] random_sample_description["batch"] = [x + str(np.random.randint(0, 3)) for x in random_sample_description["condition"]] - random_sample_description["size_factors"] = np.random.uniform(0.9, 1.1, sim.nobs) # TODO put into simulation. + random_sample_description["size_factors"] = np.random.uniform(0.9, 1.1, nobs) # TODO put into simulation. det = self._fit_continuous_interaction( - sim=sim, + model=model, sample_description=random_sample_description, test=test, constrained=constrained, diff --git a/diffxpy/unit_test/test_data_types.py b/diffxpy/unit_test/test_data_types.py index 3f82152..69a149a 100644 --- a/diffxpy/unit_test/test_data_types.py +++ b/diffxpy/unit_test/test_data_types.py @@ -6,7 +6,7 @@ import scipy.sparse import anndata -from batchglm.api.models.numpy.glm_nb import Simulator +from batchglm.models.glm_nb import Model as NBModel import diffxpy.api as de @@ -20,7 +20,7 @@ def _test_wald(self, data, sample_description, gene_names=None): factor_loc_totest="condition", formula_loc="~ 1 + condition", noise_model="nb", - batch_size=5 + batch_size=(5, 5) ) _ = test.summary() @@ -54,14 +54,18 @@ def _test_rank(self, data, sample_description, gene_names=None): _ = test.summary() def simulate(self, n_cells: int = 200, n_genes: int = 2): - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate() + model = NBModel() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + ) random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.input_data.num_observations) + "condition": np.random.randint(2, size=model.num_observations) }) - return sim.x, random_sample_description + return model.x, random_sample_description def _test_numpy(self, sparse): data, sample_description = self.simulate() diff --git a/diffxpy/unit_test/test_enrich.py b/diffxpy/unit_test/test_enrich.py index 2b79551..07e9469 100644 --- a/diffxpy/unit_test/test_enrich.py +++ b/diffxpy/unit_test/test_enrich.py @@ -1,7 +1,7 @@ import unittest import logging -from batchglm.api.models.numpy.glm_nb import Simulator +from batchglm.models.glm_nb import Model as NBModel import diffxpy.api as de @@ -14,16 +14,20 @@ def test_for_fatal(self): logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) - sim = Simulator(num_observations=50, num_features=10) - sim.generate_sample_description(num_batches=0, num_conditions=2) - sim.generate() + model = NBModel() + model.generate_artificial_data( + n_obs=50, + n_vars=10, + num_batches=0, + 
num_conditions=2 + ) test = de.test.wald( - data=sim.X, + data=model.x, + gene_names=[str(x) for x in range(model.x.shape[1])], factor_loc_totest="condition", formula_loc="~ 1 + condition", - sample_description=sim.sample_description, - gene_names=[str(x) for x in range(sim.X.shape[1])], + sample_description=model.sample_description, training_strategy="DEFAULT", dtype="float64" ) diff --git a/diffxpy/unit_test/test_extreme_values.py b/diffxpy/unit_test/test_extreme_values.py index 36e7de9..dbcdc24 100644 --- a/diffxpy/unit_test/test_extreme_values.py +++ b/diffxpy/unit_test/test_extreme_values.py @@ -4,7 +4,7 @@ import numpy as np import pandas as pd -from batchglm.api.models.numpy.glm_nb import Simulator +from batchglm.models.glm_nb import Model as NBModel import diffxpy.api as de @@ -19,18 +19,23 @@ def test_t_test_zero_variance(self): logging.getLogger("diffxpy").setLevel(logging.WARNING) np.random.seed(1) - sim = Simulator(num_observations=1000, num_features=10) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate() - sim.input_data.x[:, 0] = 0 - sim.input_data.x[:, 1] = 5 + model = NBModel() + model.generate_artificial_data( + n_obs=1000, + n_vars=10, + num_batches=0, + num_conditions=0, + ) + model.x[:, 0] = 0 + model.x[:, 1] = 5 random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.nobs) + "condition": np.random.randint(2, size=1000) }) test = de.test.t_test( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=random_sample_description, grouping="condition", is_sig_zerovar=True @@ -50,18 +55,23 @@ def test_rank_test_zero_variance(self): logging.getLogger("diffxpy").setLevel(logging.WARNING) np.random.seed(1) - sim = Simulator(num_observations=1000, num_features=10) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate() - sim.input_data.x[:, 0] = 0 - sim.input_data.x[:, 1] = 5 + model = NBModel() + model.generate_artificial_data( + n_obs=1000, + n_vars=10, + num_batches=0, + num_conditions=0, + ) + model.x[:, 0] = 0 + model.x[:, 1] = 5 random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.nobs) + "condition": np.random.randint(2, size=1000) }) test = de.test.rank_test( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=random_sample_description, grouping="condition", is_sig_zerovar=True diff --git a/diffxpy/unit_test/test_fit.py b/diffxpy/unit_test/test_fit.py index 99c5430..980eb17 100644 --- a/diffxpy/unit_test/test_fit.py +++ b/diffxpy/unit_test/test_fit.py @@ -4,7 +4,9 @@ import pandas as pd import diffxpy.api as de - +from batchglm.models.glm_nb import Model as NBModel +from batchglm.models.glm_norm import Model as NormModel +from batchglm.models.glm_poisson import Model as PoissonModel class _TestFit: @@ -25,26 +27,33 @@ def _test_model_fit( :param noise_model: Noise model to use for data fitting. 
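        (For "poisson", the branch below still assigns a rand_fn_scale draw; the
        Poisson model has no scale parameter, so the value is accepted but unused.)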
""" if noise_model == "nb": - from batchglm.api.models.numpy.glm_nb import Simulator + model = NBModel() rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": - from batchglm.api.models.numpy.glm_norm import Simulator + model = NormModel() rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + elif noise_model == "poisson": + model = PoissonModel() + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) # since it is called later but not used else: raise ValueError("noise model %s not recognized" % noise_model) - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate_params(rand_fn_scale=rand_fn_scale) - sim.generate_data() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + rand_fn_scale=rand_fn_scale + ) random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.nobs), - "batch": np.random.randint(2, size=sim.nobs) + "condition": np.random.randint(2, size=n_cells), + "batch": np.random.randint(2, size=n_cells) }) _ = de.fit.model( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=random_sample_description, formula_loc="~ 1 + condition + batch", noise_model=noise_model @@ -68,26 +77,33 @@ def _test_model_fit_partition( :param noise_model: Noise model to use for data fitting. """ if noise_model == "nb": - from batchglm.api.models.numpy.glm_nb import Simulator + model = NBModel() rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif noise_model == "norm": - from batchglm.api.models.numpy.glm_norm import Simulator + model = NormModel() rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + elif noise_model == "poisson": + model = PoissonModel() + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) # since it is called later but not used else: raise ValueError("noise model %s not recognized" % noise_model) - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate_params(rand_fn_scale=rand_fn_scale) - sim.generate_data() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + rand_fn_scale=rand_fn_scale + ) random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.nobs), - "batch": np.random.randint(2, size=sim.nobs) + "condition": np.random.randint(2, size=n_cells), + "batch": np.random.randint(2, size=n_cells) }) partition = de.fit.partition( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=random_sample_description, parts="condition" ) @@ -114,23 +130,29 @@ def _test_residuals_fit( :param noise_model: Noise model to use for data fitting. 
""" if noise_model == "nb": - from batchglm.api.models.numpy.glm_nb import Simulator + model = NBModel() elif noise_model == "norm": - from batchglm.api.models.numpy.glm_norm import Simulator + model = NormModel() + elif noise_model == "poisson": + model = PoissonModel() else: raise ValueError("noise model %s not recognized" % noise_model) - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + ) random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.nobs), - "batch": np.random.randint(2, size=sim.nobs) + "condition": np.random.randint(2, size=n_cells), + "batch": np.random.randint(2, size=n_cells) }) res = de.fit.residuals( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=random_sample_description, formula_loc="~ 1 + condition + batch", noise_model=noise_model @@ -281,6 +303,77 @@ def test_residuals_fit( noise_model="norm" ) +class TestFitNorm(_TestFit, unittest.TestCase): + """ + Normal noise model unit tests that tests whether model fit relay works. + """ + + def test_model_fit( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if model fit for "norm" noise model works. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_model_fit( + n_cells=n_cells, + n_genes=n_genes, + noise_model="poisson" + ) + + def test_model_fit_partition( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if partitioned model fit for "norm" noise model works. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_model_fit_partition( + n_cells=n_cells, + n_genes=n_genes, + noise_model="poisson" + ) + + def test_residuals_fit( + self, + n_cells: int = 2000, + n_genes: int = 2 + ): + """ + Test if residual fit for "norm" noise model works. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). 
+ """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + return self._test_residuals_fit( + n_cells=n_cells, + n_genes=n_genes, + noise_model="poisson" + ) + if __name__ == '__main__': unittest.main() diff --git a/diffxpy/unit_test/test_numeric_covar.py b/diffxpy/unit_test/test_numeric_covar.py index 2e9b2c5..cfab496 100644 --- a/diffxpy/unit_test/test_numeric_covar.py +++ b/diffxpy/unit_test/test_numeric_covar.py @@ -3,7 +3,7 @@ import numpy as np import logging -from batchglm.api.models.numpy.glm_nb import Simulator +from batchglm.models.glm_nb import Model as NBModel import diffxpy.api as de @@ -19,17 +19,21 @@ def test(self): logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) - sim = Simulator(num_observations=2000, num_features=2) - sim.generate_sample_description(num_batches=0, num_conditions=2) - sim.generate_params() - sim.generate_data() + model = NBModel() + model.generate_artificial_data( + n_obs=2000, + n_vars=2, + num_batches=0, + num_conditions=2, + ) - sample_description = sim.sample_description - sample_description["numeric1"] = np.random.random(size=sim.nobs) - sample_description["numeric2"] = np.random.random(size=sim.nobs) + sample_description = model.sample_description + sample_description["numeric1"] = np.random.random(size=2000) + sample_description["numeric2"] = np.random.random(size=2000) test = de.test.wald( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=sample_description, formula_loc="~ 1 + condition + numeric1 + numeric2", formula_scale="~ 1", @@ -38,7 +42,7 @@ def test(self): training_strategy="DEFAULT" ) # Check that number of coefficients is correct. 
- assert test.model_estim.a_var.shape[0] == 4 + assert test.model_estim.model_container.theta_location.shape[0] == 4 return True diff --git a/diffxpy/unit_test/test_pairwise_null.py b/diffxpy/unit_test/test_pairwise_null.py index 2349758..b324aa9 100644 --- a/diffxpy/unit_test/test_pairwise_null.py +++ b/diffxpy/unit_test/test_pairwise_null.py @@ -3,6 +3,9 @@ import numpy as np import pandas as pd import scipy.stats as stats +from batchglm.models.glm_nb import Model as NBModel +from batchglm.models.glm_norm import Model as NormModel +from batchglm.models.glm_poisson import Model as PoissonModel import diffxpy.api as de @@ -17,29 +20,37 @@ def _prepate_data( n_genes: int, n_groups: int ): + + if self.noise_model == "nb": - from batchglm.api.models.numpy.glm_nb import Simulator rand_fn_loc = lambda shape: np.random.uniform(0.1, 1, shape) rand_fn_scale = lambda shape: np.random.uniform(0.5, 1, shape) + model = NBModel() elif self.noise_model == "norm" or self.noise_model is None: - from batchglm.api.models.numpy.glm_norm import Simulator rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + model = NormModel() + elif self.noise_model == "poisson": + rand_fn_loc = lambda shape: np.random.uniform(2, 10, shape) + rand_fn_scale = None + model = PoissonModel() else: raise ValueError("noise model %s not recognized" % self.noise_model) - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate_params( + + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, rand_fn_loc=rand_fn_loc, rand_fn_scale=rand_fn_scale ) - sim.generate_data() random_sample_description = pd.DataFrame({ - "condition": [str(x) for x in np.random.randint(n_groups, size=sim.nobs)] + "condition": [str(x) for x in np.random.randint(n_groups, size=n_cells)] }) - return sim, random_sample_description + return model, random_sample_description def _test_null_distribution_basic( self, @@ -59,13 +70,14 @@ def _test_null_distribution_basic( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). 
""" - sim, sample_description = self._prepate_data( + model, sample_description = self._prepate_data( n_cells=n_cells, n_genes=n_genes, n_groups=n_groups ) det = de.test.pairwise( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=sample_description, grouping="condition", test=test, @@ -102,7 +114,7 @@ def test_null_distribution_ttest(self): logging.getLogger("diffxpy").setLevel(logging.WARNING) np.random.seed(1) - self.noise_model = None + self.noise_model = "norm" self._test_null_distribution_basic(test="t-test", lazy=False) def test_null_distribution_rank(self): @@ -111,9 +123,48 @@ def test_null_distribution_rank(self): logging.getLogger("diffxpy").setLevel(logging.WARNING) np.random.seed(1) - self.noise_model = None + self.noise_model = "norm" self._test_null_distribution_basic(test="rank", lazy=False) +class TestPairwiseNullPoisson(unittest.TestCase, _TestPairwiseNull): + + def test_null_distribution_ztest(self): + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + self.noise_model = "poisson" + self._test_null_distribution_basic(test="z-test", lazy=False, quick_scale=False) + + def test_null_distribution_ztest_lazy(self): + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + self.noise_model = "poisson" + self._test_null_distribution_basic(test="z-test", lazy=True, quick_scale=False) + + def test_null_distribution_wald(self): + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + self.noise_model = "poisson" + self._test_null_distribution_basic(test="wald", lazy=False, quick_scale=False) + + def test_null_distribution_lrt(self): + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + np.random.seed(1) + self.noise_model = "poisson" + self._test_null_distribution_basic(test="lrt", lazy=False, quick_scale=False) + + class TestPairwiseNullNb(unittest.TestCase, _TestPairwiseNull): diff --git a/diffxpy/unit_test/test_partition.py b/diffxpy/unit_test/test_partition.py index 5423803..6717fb0 100644 --- a/diffxpy/unit_test/test_partition.py +++ b/diffxpy/unit_test/test_partition.py @@ -4,7 +4,7 @@ import pandas as pd import scipy.stats as stats -from batchglm.api.models.numpy.glm_nb import Simulator +from batchglm.models.glm_nb import Model as NBModel import diffxpy.api as de @@ -24,18 +24,22 @@ def test_null_distribution_wald(self, n_cells: int = 4000, n_genes: int = 200): logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=2) - sim.generate() - + model = NBModel() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=2 + ) sample_description = pd.DataFrame({ - "covar1": np.random.randint(2, size=sim.nobs), - "covar2": np.random.randint(2, size=sim.nobs) + "covar1": np.random.randint(2, size=n_cells), + "covar2": np.random.randint(2, size=n_cells) }) - sample_description["cond"] = 
sim.sample_description["condition"].values + sample_description["cond"] = model.sample_description["condition"].values partition = de.test.partition( - data=sim.x, + data=model.x, + gene_names=model.features, parts="cond", sample_description=sample_description ) @@ -69,18 +73,22 @@ def test_null_distribution_wald_multi(self, n_cells: int = 4000, n_genes: int = logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=2) - sim.generate() - + model = NBModel() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=2 + ) sample_description = pd.DataFrame({ - "covar1": np.random.randint(4, size=sim.nobs), - "covar2": np.random.randint(2, size=sim.nobs) + "covar1": np.random.randint(4, size=n_cells), + "covar2": np.random.randint(2, size=n_cells) }) - sample_description["cond"] = sim.sample_description["condition"].values + sample_description["cond"] = model.sample_description["condition"].values partition = de.test.partition( - data=sim.x, + data=model.x, + gene_names=model.features, parts="cond", sample_description=sample_description ) @@ -114,18 +122,22 @@ def test_null_distribution_lrt(self, n_cells: int = 4000, n_genes: int = 200): logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=2) - sim.generate() - + model = NBModel() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=2 + ) sample_description = pd.DataFrame({ - "covar1": np.random.randint(2, size=sim.nobs), - "covar2": np.random.randint(2, size=sim.nobs) + "covar1": np.random.randint(2, size=n_cells), + "covar2": np.random.randint(2, size=n_cells) }) - sample_description["cond"] = sim.sample_description["condition"].values + sample_description["cond"] = model.sample_description["condition"].values partition = de.test.partition( - data=sim.x, + data=model.x, + gene_names=model.features, parts="cond", sample_description=sample_description ) @@ -161,17 +173,21 @@ def test_null_distribution_ttest(self, n_cells: int = 4000, n_genes: int = 200): logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=2) - sim.generate() - + model = NBModel() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=2 + ) sample_description = pd.DataFrame({ - "covar1": np.random.randint(2, size=sim.nobs) + "covar1": np.random.randint(2, size=n_cells), }) - sample_description["cond"] = sim.sample_description["condition"].values + sample_description["cond"] = model.sample_description["condition"].values partition = de.test.partition( - data=sim.x, + data=model.x, + gene_names=model.features, parts="cond", sample_description=sample_description ) @@ -204,17 +220,21 @@ def test_null_distribution_rank(self, n_cells: int = 4000, n_genes: int = 200): logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, 
num_conditions=2) - sim.generate() - + model = NBModel() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=2 + ) sample_description = pd.DataFrame({ - "covar1": np.random.randint(2, size=sim.nobs) + "covar1": np.random.randint(2, size=n_cells), }) - sample_description["cond"] = sim.sample_description["condition"].values + sample_description["cond"] = model.sample_description["condition"].values partition = de.test.partition( - data=sim.x, + data=model.x, + gene_names=model.features, parts="cond", sample_description=sample_description ) diff --git a/diffxpy/unit_test/test_single_de.py b/diffxpy/unit_test/test_single_de.py index d6a35f4..cb46f7e 100644 --- a/diffxpy/unit_test/test_single_de.py +++ b/diffxpy/unit_test/test_single_de.py @@ -3,6 +3,9 @@ import numpy as np import diffxpy.api as de +from batchglm.models.glm_nb import Model as NBModel +from batchglm.models.glm_norm import Model as NormModel +from batchglm.models.glm_poisson import Model as PoissonModel class _TestSingleDe: @@ -20,30 +23,41 @@ def _prepare_data( :param noise_model: Noise model to use for data fitting. """ if noise_model == "nb": - from batchglm.api.models.numpy.glm_nb import Simulator rand_fn_loc = lambda shape: np.random.uniform(5, 10, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + model = NBModel() elif noise_model == "norm": - from batchglm.api.models.numpy.glm_norm import Simulator rand_fn_loc = lambda shape: np.random.uniform(500, 1000, shape) rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + model = NormModel() + elif noise_model == "poisson": + rand_fn_loc = lambda shape: np.random.uniform(2, 10, shape) + rand_fn_scale = None # not used + model = PoissonModel() else: raise ValueError("noise model %s not recognized" % noise_model) num_non_de = n_genes // 2 - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=2) - sim.generate_params( + def theta_location_setter(x): + x[1, :num_non_de] = 0 + return x + def theta_scale_setter(x): + x[1, :num_non_de] = 0 + return x + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=2, rand_fn_loc=rand_fn_loc, - rand_fn_scale=rand_fn_scale + rand_fn_scale=rand_fn_scale, + theta_location_setter=theta_location_setter, + theta_scale_setter=theta_scale_setter, ) - sim.a_var[1, :num_non_de] = 0 - sim.b_var[1, :num_non_de] = 0 self.isDE = np.arange(n_genes) >= num_non_de - sim.generate_data() - return sim + return model - def _eval(self, sim, test): + def _eval(self, model, test): idx_de = np.where(self.isDE)[0] idx_nonde = np.where(np.logical_not(self.isDE))[0] @@ -61,7 +75,7 @@ def _eval(self, sim, test): assert frac_de_of_non_de <= 0.1, "too many false-positives %f" % frac_de_of_non_de assert frac_de_of_de >= 0.5, "too many false-negatives %f" % frac_de_of_de - return sim + return model def _test_rank_de( self, @@ -76,19 +90,20 @@ def _test_rank_de( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) - sim = self._prepare_data( + model = self._prepare_data( n_cells=n_cells, n_genes=n_genes, noise_model="norm" ) test = de.test.rank_test( - data=sim.input_data, - sample_description=sim.sample_description, + data=model.x, + gene_names=model.features, + sample_description=model.sample_description, grouping="condition" ) - self._eval(sim=sim, test=test) + self._eval(model=model, test=test) return True @@ -105,19 
+120,20 @@ def _test_t_test_de( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) - sim = self._prepare_data( + model = self._prepare_data( n_cells=n_cells, n_genes=n_genes, noise_model="norm" ) test = de.test.t_test( - data=sim.input_data, + data=model.x, + gene_names=model.features, grouping="condition", - sample_description=sim.sample_description + sample_description=model.sample_description, ) - self._eval(sim=sim, test=test) + self._eval(model=model, test=test) return True @@ -136,15 +152,16 @@ def _test_wald_de( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) - sim = self._prepare_data( + model = self._prepare_data( n_cells=n_cells, n_genes=n_genes, noise_model=noise_model ) test = de.test.wald( - data=sim.input_data, - sample_description=sim.sample_description, + data=model.x, + gene_names=model.features, + sample_description=model.sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition", noise_model=noise_model, @@ -152,7 +169,7 @@ def _test_wald_de( dtype="float64" ) - self._eval(sim=sim, test=test) + self._eval(model=model, test=test) return True @@ -171,15 +188,16 @@ def _test_wald_repeated_de( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) - sim = self._prepare_data( + model = self._prepare_data( n_cells=n_cells, n_genes=n_genes, noise_model=noise_model ) test1 = de.test.wald( - data=sim.input_data, - sample_description=sim.sample_description, + data=model.x, + gene_names=model.features, + sample_description=model.sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition", noise_model=noise_model, @@ -192,7 +210,7 @@ def _test_wald_repeated_de( ) assert np.max(test.log10_pval_clean() - test1.log10_pval_clean()) < 1e-10 - self._eval(sim=sim, test=test) + self._eval(model=model, test=test) return True def _test_lrt_de( @@ -210,15 +228,16 @@ def _test_lrt_de( logging.getLogger("batchglm").setLevel(logging.WARNING) logging.getLogger("diffxpy").setLevel(logging.WARNING) - sim = self._prepare_data( + model = self._prepare_data( n_cells=n_cells, n_genes=n_genes, noise_model=noise_model ) test = de.test.lrt( - data=sim.input_data, - sample_description=sim.sample_description, + data=model.x, + gene_names=model.features, + sample_description=model.sample_description, full_formula_loc="~ 1 + condition", full_formula_scale="~ 1", reduced_formula_loc="~ 1", @@ -228,7 +247,7 @@ def _test_lrt_de( dtype="float64" ) - self._eval(sim=sim, test=test) + self._eval(model=model, test=test) return True @@ -278,11 +297,14 @@ def test_rank_de( class TestSingleDeNb(_TestSingleDe, unittest.TestCase): + + noise_model = 'nb' + """ - Negative binomial noise model unit tests that tests false positive and false negative rates. + Negative binomial (default) noise model unit tests that tests false positive and false negative rates. 
""" - def test_wald_de_nb( + def test_wald_de( self, n_cells: int = 2000, n_genes: int = 200 @@ -299,7 +321,7 @@ def test_wald_de_nb( return self._test_wald_de( n_cells=n_cells, n_genes=n_genes, - noise_model="nb" + noise_model=self.noise_model ) def test_wald_repeated_de_nb( @@ -319,7 +341,7 @@ def test_wald_repeated_de_nb( return self._test_wald_repeated_de( n_cells=n_cells, n_genes=n_genes, - noise_model="nb" + noise_model=self.noise_model ) def test_lrt_de_nb( @@ -339,54 +361,14 @@ def test_lrt_de_nb( return self._test_lrt_de( n_cells=n_cells, n_genes=n_genes, - noise_model="nb" - ) - - -class TestSingleDeNorm(_TestSingleDe, unittest.TestCase): - """ - Normal noise model unit tests that tests false positive and false negative rates. - """ - - def test_wald_de_norm( - self, - n_cells: int = 2000, - n_genes: int = 200 - ): - """ - :param n_cells: Number of cells to simulate (number of observations per test). - :param n_genes: Number of genes to simulate (number of tests). - """ - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.WARNING) - logging.getLogger("diffxpy").setLevel(logging.WARNING) - - np.random.seed(1) - return self._test_wald_de( - n_cells=n_cells, - n_genes=n_genes, - noise_model="norm" + noise_model=self.noise_model ) - def test_lrt_de_norm( - self, - n_cells: int = 2000, - n_genes: int = 200 - ): - """ - :param n_cells: Number of cells to simulate (number of observations per test). - :param n_genes: Number of genes to simulate (number of tests). - """ - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.WARNING) - logging.getLogger("diffxpy").setLevel(logging.WARNING) +class TestSingleDePoisson(TestSingleDeNb, unittest.TestCase): + noise_model = "poisson" - np.random.seed(1) - return self._test_lrt_de( - n_cells=n_cells, - n_genes=n_genes, - noise_model="norm" - ) +class TestSingleDeNorm(TestSingleDeNb, unittest.TestCase): + noise_model = "norm" if __name__ == '__main__': diff --git a/diffxpy/unit_test/test_single_external_libs.py b/diffxpy/unit_test/test_single_external_libs.py index 6841aa5..72861f9 100644 --- a/diffxpy/unit_test/test_single_external_libs.py +++ b/diffxpy/unit_test/test_single_external_libs.py @@ -3,7 +3,7 @@ import numpy as np import scipy.stats as stats -from batchglm.api.models.numpy.glm_nb import Simulator +from batchglm.models.glm_nb import Model as NBModel import diffxpy.api as de @@ -15,17 +15,20 @@ def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100): :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). 
""" - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=2) - sim.generate_params() - sim.generate_data() + model = NBModel() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=2, + ) - return sim + return model def _eval(self, test, ref_pvals): test_pval = test.pval pval_dev = np.abs(test_pval - ref_pvals) - log_pval_dev = np.abs(np.log(test_pval+1e-200) - np.log(ref_pvals+1e-200)) + log_pval_dev = np.abs(np.log(test_pval+1e-8) - np.log(ref_pvals+1e-8)) max_dev = np.max(pval_dev) max_log_dev = np.max(log_pval_dev) mean_dev = np.mean(log_pval_dev) @@ -56,20 +59,21 @@ def test_t_test_ref(self, n_cells: int = 2000, n_genes: int = 100): logging.getLogger("diffxpy").setLevel(logging.INFO) np.random.seed(1) - sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) + model = self._prepare_data(n_cells=n_cells, n_genes=n_genes) test = de.test.t_test( - data=sim.input_data, + data=model.x, + gene_names=model.features, grouping="condition", - sample_description=sim.sample_description + sample_description=model.sample_description ) # Run scipy t-tests as a reference. - conds = np.unique(sim.sample_description["condition"].values) - ind_a = np.where(sim.sample_description["condition"] == conds[0])[0] - ind_b = np.where(sim.sample_description["condition"] == conds[1])[0] + conds = np.unique(model.sample_description["condition"].values) + ind_a = np.where(model.sample_description["condition"] == conds[0])[0] + ind_b = np.where(model.sample_description["condition"] == conds[1])[0] scipy_pvals = stats.ttest_ind( - a=sim.x[ind_a, :], - b=sim.x[ind_b, :], + a=model.x[ind_a, :], + b=model.x[ind_b, :], axis=0, equal_var=False ).pvalue @@ -88,25 +92,26 @@ def test_rank_ref(self, n_cells: int = 2000, n_genes: int = 100): logging.getLogger("diffxpy").setLevel(logging.INFO) np.random.seed(1) - sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) + model = self._prepare_data(n_cells=n_cells, n_genes=n_genes) test = de.test.rank_test( - data=sim.input_data, + data=model.x, + gene_names=model.features, grouping="condition", - sample_description=sim.sample_description + sample_description=model.sample_description ) # Run scipy t-tests as a reference. 
- conds = np.unique(sim.sample_description["condition"].values) - ind_a = np.where(sim.sample_description["condition"] == conds[0])[0] - ind_b = np.where(sim.sample_description["condition"] == conds[1])[0] + conds = np.unique(model.sample_description["condition"].values) + ind_a = np.where(model.sample_description["condition"] == conds[0])[0] + ind_b = np.where(model.sample_description["condition"] == conds[1])[0] scipy_pvals = np.array([ stats.mannwhitneyu( - x=sim.x[ind_a, i], - y=sim.x[ind_b, i], + x=model.x[ind_a, i], + y=model.x[ind_b, i], use_continuity=True, alternative="two-sided" ).pvalue - for i in range(sim.x.shape[1]) + for i in range(model.x.shape[1]) ]) self._eval(test=test, ref_pvals=scipy_pvals) return True diff --git a/diffxpy/unit_test/test_single_fullrank.py b/diffxpy/unit_test/test_single_fullrank.py index 4cb052a..b4d79b6 100644 --- a/diffxpy/unit_test/test_single_fullrank.py +++ b/diffxpy/unit_test/test_single_fullrank.py @@ -3,6 +3,9 @@ import numpy as np import pandas as pd +from batchglm.models.glm_nb import Model as NBModel +from batchglm.models.glm_norm import Model as NormModel + import diffxpy.api as de @@ -20,29 +23,33 @@ def _test_single_full_rank(self): :param noise_model: Noise model to use for data fitting. """ if self.noise_model == "nb": - from batchglm.api.models.numpy.glm_nb import Simulator + model = NBModel() rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) elif self.noise_model == "norm": - from batchglm.api.models.numpy.glm_norm import Simulator + model = NormModel() rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) else: raise ValueError("noise model %s not recognized" % self.noise_model) - sim = Simulator(num_observations=200, num_features=2) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate_params(rand_fn_scale=rand_fn_scale) - sim.generate_data() + model.generate_artificial_data( + n_obs=200, + n_vars=2, + num_batches=0, + num_conditions=0, + rand_fn_scale=rand_fn_scale + ) random_sample_description = pd.DataFrame({ - "condition": [str(x) for x in np.random.randint(2, size=sim.nobs)] + "condition": [str(x) for x in np.random.randint(2, size=200)] }) try: random_sample_description["batch"] = random_sample_description["condition"] _ = de.test.wald( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=random_sample_description, - factor_loc_totest="condition", + factor_loc_totest="condition[T.1]", formula_loc="~ 1 + condition + batch", noise_model=self.noise_model ) @@ -56,9 +63,10 @@ def _test_single_full_rank(self): x + str(np.random.randint(0, 2)) for x in random_sample_description["condition"].values ] _ = de.test.wald( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=random_sample_description, - factor_loc_totest="condition", + factor_loc_totest="condition[T.1]", formula_loc="~ 1 + condition + batch", constraints_loc={"batch": "condition"}, noise_model=self.noise_model diff --git a/diffxpy/unit_test/test_single_null.py b/diffxpy/unit_test/test_single_null.py index e20faf5..3067432 100644 --- a/diffxpy/unit_test/test_single_null.py +++ b/diffxpy/unit_test/test_single_null.py @@ -3,6 +3,9 @@ import numpy as np import pandas as pd import scipy.stats as stats +from batchglm.models.glm_nb import Model as NBModel +from batchglm.models.glm_norm import Model as NormModel +from batchglm.models.glm_poisson import Model as PoissonModel import diffxpy.api as de @@ -26,26 +29,32 @@ def _test_null_distribution_wald( :param 
noise_model: Noise model to use for data fitting. """ if noise_model == "nb": - from batchglm.api.models.numpy.glm_nb import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + model = NBModel() elif noise_model == "norm": - from batchglm.api.models.numpy.glm_norm import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + model = NormModel() + elif noise_model == "poisson": + rand_fn_scale = None + model = PoissonModel() else: raise ValueError("noise model %s not recognized" % noise_model) - - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate_params(rand_fn_scale=rand_fn_scale) - sim.generate_data() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + rand_fn_scale=rand_fn_scale + ) random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.nobs), - "batch": np.random.randint(2, size=sim.nobs) + "condition": np.random.randint(2, size=n_cells), + "batch": np.random.randint(2, size=n_cells) }) test = de.test.wald( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=random_sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition + batch", @@ -78,26 +87,32 @@ def _test_null_distribution_wald_repeated( :param noise_model: Noise model to use for data fitting. """ if noise_model == "nb": - from batchglm.api.models.numpy.glm_nb import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + model = NBModel() elif noise_model == "norm": - from batchglm.api.models.numpy.glm_norm import Simulator rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) + model = NormModel() + elif noise_model == "poisson": + rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape) # not used + model = PoissonModel() else: raise ValueError("noise model %s not recognized" % noise_model) - - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate_params(rand_fn_scale=rand_fn_scale) - sim.generate_data() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + rand_fn_scale=rand_fn_scale + ) random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.nobs), - "batch": np.random.randint(2, size=sim.nobs) + "condition": np.random.randint(2, size=n_cells), + "batch": np.random.randint(2, size=n_cells) }) test1 = de.test.wald( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=random_sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition + batch", @@ -134,22 +149,26 @@ def _test_null_distribution_wald_multi( :param noise_model: Noise model to use for data fitting. 
""" if noise_model == "nb": - from batchglm.api.models.numpy.glm_nb import Simulator + model = NBModel() elif noise_model == "norm": - from batchglm.api.models.numpy.glm_norm import Simulator + model = NormModel() + elif noise_model == "poisson": + model = PoissonModel() else: raise ValueError("noise model %s not recognized" % noise_model) - - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate() - + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + ) random_sample_description = pd.DataFrame({ - "condition": np.random.randint(4, size=sim.nobs) + "condition": np.random.randint(4, size=n_cells) }) test = de.test.wald( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=random_sample_description, factor_loc_totest="condition", formula_loc="~ 1 + condition", @@ -182,22 +201,26 @@ def _test_null_distribution_lrt( :param noise_model: Noise model to use for data fitting. """ if noise_model == "nb": - from batchglm.api.models.numpy.glm_nb import Simulator + model = NBModel() elif noise_model == "norm": - from batchglm.api.models.numpy.glm_norm import Simulator + model = NormModel() + elif noise_model == "poisson": + model = PoissonModel() else: raise ValueError("noise model %s not recognized" % noise_model) - - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate() - + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + ) random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.nobs) + "condition": np.random.randint(2, size=n_cells) }) test = de.test.lrt( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=random_sample_description, full_formula_loc="~ 1 + condition", full_formula_scale="~ 1", @@ -229,18 +252,22 @@ def _test_null_distribution_ttest( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). """ - from batchglm.api.models.numpy.glm_norm import Simulator + model = NormModel() - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + ) random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.nobs) + "condition": np.random.randint(2, size=n_cells) }) test = de.test.t_test( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=random_sample_description, grouping="condition", is_logged=False @@ -269,18 +296,21 @@ def _test_null_distribution_rank( :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). 
""" - from batchglm.api.models.numpy.glm_norm import Simulator - - sim = Simulator(num_observations=n_cells, num_features=n_genes) - sim.generate_sample_description(num_batches=0, num_conditions=0) - sim.generate() + model = NormModel() + model.generate_artificial_data( + n_obs=n_cells, + n_vars=n_genes, + num_batches=0, + num_conditions=0, + ) random_sample_description = pd.DataFrame({ - "condition": np.random.randint(2, size=sim.nobs) + "condition": np.random.randint(2, size=n_cells) }) test = de.test.rank_test( - data=sim.input_data, + data=model.x, + gene_names=model.features, sample_description=random_sample_description, grouping="condition" ) @@ -344,19 +374,20 @@ def test_null_distribution_rank( ) -class TestSingleNullNb(_TestSingleNull, unittest.TestCase): +class TestSingleNullModelNb(_TestSingleNull, unittest.TestCase): + noise_model = "nb" """ - Negative binomial noise model unit tests that test whether a test generates uniformly + Negative binomial (default) noise model unit tests that test whether a test generates uniformly distributed p-values if data are sampled from the null model. """ - def test_null_distribution_wald_nb( + def test_null_distribution_wald( self, n_cells: int = 2000, n_genes: int = 200 ): """ - Test if wald() generates a uniform p-value distribution for "nb" noise model. + Test if wald() generates a uniform p-value distribution for given noise model. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). @@ -369,16 +400,16 @@ def test_null_distribution_wald_nb( return self._test_null_distribution_wald( n_cells=n_cells, n_genes=n_genes, - noise_model="nb" + noise_model=self.noise_model ) - def test_null_distribution_wald_repeated_nb( + def test_null_distribution_wald_repeated( self, n_cells: int = 2000, n_genes: int = 200 ): """ - Test if wald() generates a uniform p-value distribution for "nb" noise model. + Test if wald() generates a uniform p-value distribution for given noise model. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). @@ -391,16 +422,16 @@ def test_null_distribution_wald_repeated_nb( return self._test_null_distribution_wald_repeated( n_cells=n_cells, n_genes=n_genes, - noise_model="nb" + noise_model=self.noise_model ) - def test_null_distribution_wald_multi_nb( + def test_null_distribution_wald_multi( self, n_cells: int = 2000, n_genes: int = 200 ): """ - Test if wald() generates a uniform p-value distribution for "nb" noise model + Test if wald() generates a uniform p-value distribution for given noise model for multiple coefficients to test. :param n_cells: Number of cells to simulate (number of observations per test). @@ -414,16 +445,16 @@ def test_null_distribution_wald_multi_nb( return self._test_null_distribution_wald_multi( n_cells=n_cells, n_genes=n_genes, - noise_model="nb" + noise_model=self.noise_model ) - def test_null_distribution_lrt_nb( + def test_null_distribution_lrt( self, n_cells: int = 2000, n_genes: int = 200 ): """ - Test if lrt() generates a uniform p-value distribution for "nb" noise model. + Test if lrt() generates a uniform p-value distribution for given noise model. :param n_cells: Number of cells to simulate (number of observations per test). :param n_genes: Number of genes to simulate (number of tests). 
@@ -436,10 +467,14 @@ def test_null_distribution_lrt_nb(
         return self._test_null_distribution_lrt(
             n_cells=n_cells,
             n_genes=n_genes,
-            noise_model="nb"
+            noise_model=self.noise_model
         )


+class TestSingleNullPoisson(TestSingleNullModelNb, unittest.TestCase):
+    noise_model = "poisson"
+
+
 class TestSingleNullNorm(_TestSingleNull, unittest.TestCase):
     """
     Normal noise model unit tests that test whether a test generates uniformly
@@ -447,7 +482,7 @@ class TestSingleNullNorm(_TestSingleNull, unittest.TestCase):
     def test_null_distribution_wald_norm(
             self,
-            n_cells: int = 200,
+            n_cells: int = 2000,
             n_genes: int = 200
     ):
         """
diff --git a/diffxpy/unit_test/test_single_sf_null.py b/diffxpy/unit_test/test_single_sf_null.py
index 58c588e..da08532 100644
--- a/diffxpy/unit_test/test_single_sf_null.py
+++ b/diffxpy/unit_test/test_single_sf_null.py
@@ -5,7 +5,9 @@ import scipy.stats as stats

 import diffxpy.api as de

-
+from batchglm.models.glm_nb import Model as NBModel
+from batchglm.models.glm_norm import Model as NormModel
+from batchglm.models.glm_poisson import Model as PoissonModel

 class _TestSingleSfNull:

@@ -26,32 +28,40 @@ def _test_null_distribution_wald(
         :param noise_model: Noise model to use for data fitting.
         """
         if noise_model == "nb":
-            from batchglm.api.models.numpy.glm_nb import Simulator
             rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape)
+            model = NBModel()
         elif noise_model == "norm":
-            from batchglm.api.models.numpy.glm_norm import Simulator
             rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape)
+            model = NormModel()
+        elif noise_model == "poisson":
+            rand_fn_scale = None
+            model = PoissonModel()
         else:
             raise ValueError("noise model %s not recognized" % noise_model)
-        sim = Simulator(num_observations=n_cells, num_features=n_genes)
-        sim.generate_sample_description(num_batches=0, num_conditions=0)
-        sim.generate_params(rand_fn_scale=rand_fn_scale)
-        sim.generate_data()
+
+        model.generate_artificial_data(
+            n_obs=n_cells,
+            n_vars=n_genes,
+            num_batches=0,
+            num_conditions=0,
+            rand_fn_scale=rand_fn_scale
+        )
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(2, size=sim.nobs),
-            "batch": np.random.randint(2, size=sim.nobs)
+            "condition": np.random.randint(2, size=n_cells),
+            "batch": np.random.randint(2, size=n_cells)
         })
-        random_sf = np.random.uniform(0.5, 1.5, sim.nobs)
+        random_sf = np.random.uniform(0.999, 1.001, n_cells)
         test = de.test.wald(
-            data=sim.input_data,
+            data=model.x,
+            gene_names=model.features,
             sample_description=random_sample_description,
             factor_loc_totest="condition",
             formula_loc="~ 1 + condition + batch",
             size_factors=random_sf,
-            batch_size=500,
+            batch_size=(200, 200),
             noise_model=noise_model,
             training_strategy="DEFAULT",
             dtype="float64"
@@ -103,7 +113,7 @@ class TestSingleSfNullNorm(_TestSingleSfNull, unittest.TestCase):
     def test_null_distribution_wald_norm(
             self,
-            n_cells: int = 200,
+            n_cells: int = 2000,
             n_genes: int = 200
     ):
         """
@@ -123,6 +133,33 @@ def test_null_distribution_wald_norm(
             noise_model="norm"
         )

+class TestSingleSfNullPoisson(_TestSingleSfNull, unittest.TestCase):
+    """
+    Poisson noise model unit tests that test whether a test generates uniformly
+    distributed p-values if data are sampled from the null model.
+    """
+    def test_null_distribution_wald_poisson(
+            self,
+            n_cells: int = 2000,
+            n_genes: int = 200
+    ):
+        """
+        Test if wald() generates a uniform p-value distribution for "poisson" noise model.
+
+        :param n_cells: Number of cells to simulate (number of observations per test).
+        :param n_genes: Number of genes to simulate (number of tests).
+        """
+        logging.getLogger("tensorflow").setLevel(logging.ERROR)
+        logging.getLogger("batchglm").setLevel(logging.WARNING)
+        logging.getLogger("diffxpy").setLevel(logging.WARNING)
+
+        np.random.seed(1)
+        return self._test_null_distribution_wald(
+            n_cells=n_cells,
+            n_genes=n_genes,
+            noise_model="poisson"
+        )
+

 if __name__ == '__main__':
     unittest.main()
diff --git a/diffxpy/unit_test/test_twosample.py b/diffxpy/unit_test/test_twosample.py
index 7cf357e..9d3ec22 100644
--- a/diffxpy/unit_test/test_twosample.py
+++ b/diffxpy/unit_test/test_twosample.py
@@ -3,6 +3,7 @@ import numpy as np
 import pandas as pd
 import scipy.stats as stats

+from batchglm.models.glm_nb import Model as NBModel
 import diffxpy.api as de

@@ -22,18 +23,21 @@ def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n
         logging.getLogger("tensorflow").setLevel(logging.ERROR)
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
-        from batchglm.api.models.numpy.glm_nb import Simulator
-
-        sim = Simulator(num_observations=n_cells, num_features=n_genes)
-        sim.generate_sample_description(num_batches=0, num_conditions=0)
-        sim.generate()
+        model = NBModel()
+        model.generate_artificial_data(
+            n_obs=n_cells,
+            n_vars=n_genes,
+            num_batches=0,
+            num_conditions=0
+        )
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(n_groups, size=sim.nobs)
+            "condition": np.random.randint(n_groups, size=n_cells)
         })
         test = de.test.two_sample(
-            data=sim.input_data,
+            data=model.x,
+            gene_names=model.features,
             grouping=random_sample_description["condition"].values,
             test="wald",
             noise_model="nb",
@@ -61,18 +65,21 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100, n_
         logging.getLogger("tensorflow").setLevel(logging.ERROR)
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
-        from batchglm.api.models.numpy.glm_nb import Simulator
-
-        sim = Simulator(num_observations=n_cells, num_features=n_genes)
-        sim.generate_sample_description(num_batches=0, num_conditions=0)
-        sim.generate()
+        model = NBModel()
+        model.generate_artificial_data(
+            n_obs=n_cells,
+            n_vars=n_genes,
+            num_batches=0,
+            num_conditions=0
+        )
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(n_groups, size=sim.nobs)
+            "condition": np.random.randint(n_groups, size=n_cells)
         })
         test = de.test.two_sample(
-            data=sim.input_data,
+            data=model.x,
+            gene_names=model.features,
             grouping=random_sample_description["condition"],
             test="wald",
             noise_model="nb",
@@ -100,18 +107,21 @@ def test_null_distribution_rank(self, n_cells: int = 2000, n_genes: int = 100, n
         logging.getLogger("tensorflow").setLevel(logging.ERROR)
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
-        from batchglm.api.models.numpy.glm_nb import Simulator
-
-        sim = Simulator(num_observations=n_cells, num_features=n_genes)
-        sim.generate_sample_description(num_batches=0, num_conditions=0)
-        sim.generate()
+        model = NBModel()
+        model.generate_artificial_data(
+            n_obs=n_cells,
+            n_vars=n_genes,
+            num_batches=0,
+            num_conditions=0
+        )
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(n_groups, size=sim.nobs)
+            "condition": np.random.randint(n_groups, size=n_cells)
         })
         test = de.test.two_sample(
-            data=sim.input_data,
+            data=model.x,
+            gene_names=model.features,
             grouping=random_sample_description["condition"],
             test="rank"
         )
@@ -138,18 +148,21 @@ def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100,
         logging.getLogger("tensorflow").setLevel(logging.ERROR)
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
-        from batchglm.api.models.numpy.glm_nb import Simulator
-
-        sim = Simulator(num_observations=n_cells, num_features=n_genes)
-        sim.generate_sample_description(num_batches=0, num_conditions=0)
-        sim.generate()
+        model = NBModel()
+        model.generate_artificial_data(
+            n_obs=n_cells,
+            n_vars=n_genes,
+            num_batches=0,
+            num_conditions=0
+        )
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(n_groups, size=sim.nobs)
+            "condition": np.random.randint(n_groups, size=n_cells)
         })
         test = de.test.two_sample(
-            data=sim.input_data,
+            data=model.x,
+            gene_names=model.features,
             grouping=random_sample_description["condition"],
             test="t_test"
         )
diff --git a/diffxpy/unit_test/test_vsrest.py b/diffxpy/unit_test/test_vsrest.py
index bef144a..ca24f76 100644
--- a/diffxpy/unit_test/test_vsrest.py
+++ b/diffxpy/unit_test/test_vsrest.py
@@ -3,13 +3,13 @@ import numpy as np
 import pandas as pd
 import scipy.stats as stats

+from batchglm.models.glm_nb import Model as NBModel
 import diffxpy.api as de

-
 class TestVsRest(unittest.TestCase):
-
-    def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
+    # NOTE: This test is flaky; it passes more reliably when n_groups or the simulated location parameters are less extreme.
+    def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 4):
         """
         Test if de.test_wald_loc() generates a uniform p-value distribution if
         it is given data simulated based on the null model.
         Returns the p-value
@@ -22,23 +22,29 @@ def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100, n
         logging.getLogger("tensorflow").setLevel(logging.ERROR)
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
-        from batchglm.api.models.numpy.glm_nb import Simulator
-
-        sim = Simulator(num_observations=n_cells, num_features=n_genes)
-        sim.generate_sample_description(num_batches=0, num_conditions=0)
-        sim.generate()
+        model = NBModel()
+        rand_fn_loc = lambda shape: np.random.uniform(9, 10, shape)
+        rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape)
+        model.generate_artificial_data(
+            n_obs=n_cells,
+            n_vars=n_genes,
+            num_batches=0,
+            num_conditions=0,
+            rand_fn_loc=rand_fn_loc,
+            rand_fn_scale=rand_fn_scale
+        )
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(n_groups, size=sim.nobs)
+            "condition": np.random.randint(n_groups, size=n_cells)
         })
         test = de.test.versus_rest(
-            data=sim.input_data,
+            data=model.x,
+            gene_names=model.features,
             grouping="condition",
             test="wald",
             noise_model="nb",
             sample_description=random_sample_description,
-            batch_size=500,
             training_strategy="DEFAULT",
             dtype="float64"
         )
@@ -65,23 +71,25 @@ def test_null_distribution_lrt(self, n_cells: int = 2000, n_genes: int = 100):
         logging.getLogger("tensorflow").setLevel(logging.ERROR)
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.ERROR)
-        from batchglm.api.models.numpy.glm_nb import Simulator
-
-        sim = Simulator(num_observations=n_cells, num_features=n_genes)
-        sim.generate_sample_description(num_batches=0, num_conditions=0)
-        sim.generate()
+        model = NBModel()
+        model.generate_artificial_data(
+            n_obs=n_cells,
+            n_vars=n_genes,
+            num_batches=0,
+            num_conditions=0
+        )
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(2, size=sim.nobs)
+            "condition": np.random.randint(2, size=n_cells)
         })
         test = de.test.versus_rest(
-            data=sim.input_data,
+            data=model.x,
+            gene_names=model.features,
             grouping="condition",
             test="lrt",
             noise_model="nb",
             sample_description=random_sample_description,
-            batch_size=500,
             training_strategy="DEFAULT",
             dtype="float64"
         )
@@ -108,18 +116,21 @@ def test_null_distribution_rank(self, n_cells: int = 2000, n_genes: int = 100, n
         logging.getLogger("tensorflow").setLevel(logging.ERROR)
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
-        from batchglm.api.models.numpy.glm_nb import Simulator
-
-        sim = Simulator(num_observations=n_cells, num_features=n_genes)
-        sim.generate_sample_description(num_batches=0, num_conditions=0)
-        sim.generate()
+        model = NBModel()
+        model.generate_artificial_data(
+            n_obs=n_cells,
+            n_vars=n_genes,
+            num_batches=0,
+            num_conditions=0
+        )
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(n_groups, size=sim.nobs)
+            "condition": np.random.randint(n_groups, size=n_cells)
         })
         test = de.test.versus_rest(
-            data=sim.input_data,
+            data=model.x,
+            gene_names=model.features,
             grouping="condition",
             test="rank",
             sample_description=random_sample_description,
@@ -135,7 +146,10 @@ def test_null_distribution_rank(self, n_cells: int = 2000, n_genes: int = 100, n

         return True

-    def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 2):
+    # NOTE: This test is flaky; it passes more reliably when n_groups or the simulated location parameters are less extreme.
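+    # "Less extreme" means fewer groups (n_groups=2, the previous default) or a narrower rand_fn_loc range;
+    # a fixed np.random.seed (as used in test_single_sf_null.py) would at least make the failure reproducible.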
+    def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100, n_groups: int = 4):
         """
         Test if de.test_wald_loc() generates a uniform p-value distribution if
         it is given data simulated based on the null model.
         Returns the p-value
@@ -148,18 +162,25 @@ def test_null_distribution_ttest(self, n_cells: int = 2000, n_genes: int = 100,
         logging.getLogger("tensorflow").setLevel(logging.ERROR)
         logging.getLogger("batchglm").setLevel(logging.WARNING)
         logging.getLogger("diffxpy").setLevel(logging.WARNING)
-        from batchglm.api.models.numpy.glm_norm import Simulator
-
-        sim = Simulator(num_observations=n_cells, num_features=n_genes)
-        sim.generate_sample_description(num_batches=0, num_conditions=0)
-        sim.generate()
+        rand_fn_loc = lambda shape: np.random.uniform(9, 10, shape)
+        rand_fn_scale = lambda shape: np.random.uniform(1, 2, shape)
+        model = NBModel()
+        model.generate_artificial_data(
+            n_obs=n_cells,
+            n_vars=n_genes,
+            num_batches=0,
+            num_conditions=0,
+            rand_fn_loc=rand_fn_loc,
+            rand_fn_scale=rand_fn_scale
+        )
         random_sample_description = pd.DataFrame({
-            "condition": np.random.randint(n_groups, size=sim.nobs)
+            "condition": np.random.randint(n_groups, size=n_cells)
         })
         test = de.test.versus_rest(
-            data=sim.input_data,
+            data=model.x,
+            gene_names=model.features,
             grouping="condition",
             test="t-test",
             sample_description=random_sample_description,
diff --git a/requirements.txt b/requirements.txt
index 3ea27ae..89f0ec8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,3 +13,4 @@ sphinx_rtd_theme
 jinja2
 docutils
 sparse==0.9.1
+scanpy
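For reference, a minimal sketch of the simulation-plus-test pattern that replaces the removed Simulator workflow throughout this patch. The batchglm calls (NBModel, generate_artificial_data, model.x, model.features) and the de.test.wald signature are taken from the hunks above; the concrete sizes, the formula and the final .pval access are illustrative assumptions, not part of the patch:

import numpy as np
import pandas as pd
from batchglm.models.glm_nb import Model as NBModel

import diffxpy.api as de

n_cells, n_genes = 2000, 100

# Simulate a count matrix with no condition/batch structure, so the null model holds.
model = NBModel()
model.generate_artificial_data(
    n_obs=n_cells,       # was Simulator(num_observations=...)
    n_vars=n_genes,      # was Simulator(num_features=...)
    num_batches=0,
    num_conditions=0
)

# Random group labels: any rejection is a false positive by construction.
sample_description = pd.DataFrame({
    "condition": np.random.randint(2, size=n_cells)
})

# data/gene_names replace the old single data=sim.input_data argument.
test = de.test.wald(
    data=model.x,
    gene_names=model.features,
    sample_description=sample_description,
    factor_loc_totest="condition",
    formula_loc="~ 1 + condition",
    noise_model="nb"
)
pvals = test.pval  # expected to be approximately uniform on [0, 1] under the null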