Skip to content

Commit

Permalink
v1.0.3 adapt to scanpy v1.10
Browse files (browse the repository at this point in the history)
  • Loading branch information
jsxlei committed Apr 16, 2024
1 parent 1e3952d commit 7cef922
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 24 deletions.
2 changes: 2 additions & 0 deletions SCALEX.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
parser.add_argument('--assess', action='store_true')
parser.add_argument('--eval', action='store_true')
parser.add_argument('--num_workers', type=int, default=4)
parser.add_argument('--keep_mt', action='store_true')
# parser.add_argument('--version', type=int, default=2)
# parser.add_argument('--k', type=str, default=30)
# parser.add_argument('--embed', type=str, default='UMAP')
Expand All @@ -70,6 +71,7 @@
fraction=args.fraction,
n_obs=args.n_obs,
processed=args.processed,
keep_mt=args.keep_mt,
use_layer=args.use_layer,
backed=args.backed,
batch_size=args.batch_size,
Expand Down
2 changes: 1 addition & 1 deletion docs/source/news.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ News
=====
.. role:: small

SCALEX is online on `Nature Communications <https://www.nature.com/articles/s41467-022-33758-z>`_ :small:`2022-10-17`
SCALEX is online on `Nature Communications <https://www.nature.com/articles/s41467-022-33758-z>`_ :small:`2022-10-17`
SCALEX is available on `bioRxiv <https://www.biorxiv.org/content/10.1101/2021.04.06.438536v1>`_ :small:`2021-04-09`
18 changes: 9 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
numpy>=1.17.2
pandas>=0.25.1
scipy>=1.3.1
scikit-learn>=0.22.1
torch>=1.0.0
scanpy>=1.4.5
tqdm>=4.28.1
matplotlib>=3.0.3
seaborn>=0.9.0
numpy>=1.26.4
pandas>=2.2.2
scipy>=1.13.0
scikit-learn>=1.4.2
torch>=2.2.2
scanpy>=1.10.1
tqdm>=4.66.2
matplotlib>=3.8.4
seaborn>=0.13.2
leidenalg>=0.8.3
sphinx_autodoc_typehints
nbsphinx
2 changes: 1 addition & 1 deletion scalex/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# from pkg_resources import get_distribution

# __version__ = get_distribution('scalex').version
__version__ = '1.0.2'
__version__ = '1.0.3'
__author__ = 'Lei Xiong'
__email__ = 'jsxlei@gmail.com'

Expand Down
16 changes: 12 additions & 4 deletions scalex/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ def preprocessing_rna(
target_sum: int = 10000,
n_top_features = 2000, # or gene list
chunk_size: int = CHUNK_SIZE,
keep_mt: bool = False,
backed: bool = False,
log=None
):
Expand Down Expand Up @@ -233,8 +234,10 @@ def preprocessing_rna(
if type(adata.X) != csr.csr_matrix and (not backed) and (not adata.isbacked):
adata.X = scipy.sparse.csr_matrix(adata.X)

adata = adata[:, [gene for gene in adata.var_names
if not str(gene).startswith(tuple(['ERCC', 'MT-', 'mt-']))]]
if not keep_mt:
if log: log.info('Filtering out MT genes')
adata = adata[:, [gene for gene in adata.var_names
if not str(gene).startswith(tuple(['ERCC', 'MT-', 'mt-']))]]

if log: log.info('Filtering cells')
sc.pp.filter_cells(adata, min_genes=min_features)
Expand All @@ -251,7 +254,8 @@ def preprocessing_rna(
adata.raw = adata
if log: log.info('Finding variable features')
if type(n_top_features) == int and n_top_features>0:
sc.pp.highly_variable_genes(adata, n_top_genes=n_top_features, batch_key='batch', inplace=False, subset=True)
sc.pp.highly_variable_genes(adata, n_top_genes=n_top_features, batch_key='batch') #, inplace=False, subset=True)
adata = adata[:, adata.var.highly_variable].copy()
elif type(n_top_features) != int:
adata = reindex(adata, n_top_features)

Expand Down Expand Up @@ -344,6 +348,7 @@ def preprocessing(
min_cells: int = 3,
target_sum: int = None,
n_top_features = None, # or gene list
keep_mt: bool = False,
backed: bool = False,
chunk_size: int = CHUNK_SIZE,
log=None
Expand Down Expand Up @@ -382,6 +387,7 @@ def preprocessing(
min_cells=min_cells,
target_sum=target_sum,
n_top_features=n_top_features,
keep_mt=keep_mt,
backed=backed,
chunk_size=chunk_size,
log=log
Expand Down Expand Up @@ -482,7 +488,7 @@ def __iter__(self):
batch = {}
sampler = np.random.permutation(len(self.batch_id))
for idx in sampler:
c = self.batch_id[idx]
c = self.batch_id.iloc[idx]
if c not in batch:
batch[c] = []
batch[c].append(idx)
Expand Down Expand Up @@ -549,6 +555,7 @@ def load_data(
min_cells=3,
target_sum=None,
n_top_features=None,
keep_mt=False,
backed=False,
batch_size=64,
chunk_size=CHUNK_SIZE,
Expand Down Expand Up @@ -626,6 +633,7 @@ def load_data(
min_cells=min_cells,
target_sum=target_sum,
n_top_features=n_top_features,
keep_mt=keep_mt,
chunk_size=chunk_size,
backed=backed,
log=log,
Expand Down
27 changes: 18 additions & 9 deletions scalex/function.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def SCALEX(
fraction:float=None,
n_obs:int=None,
use_layer:str='X',
keep_mt:bool=False,
backed:bool=False,
batch_size:int=64,
lr:float=2e-4,
Expand All @@ -55,6 +56,7 @@ def SCALEX(
show:bool=True,
eval:bool=False,
num_workers:int=4,
cell_type:str='cell_type',
) -> AnnData:
"""
Online single-cell data integration through projecting heterogeneous datasets into a common cell-embedding space
Expand Down Expand Up @@ -153,6 +155,7 @@ def SCALEX(
backed=backed,
batch_name=batch_name,
batch_key=batch_key,
keep_mt=keep_mt,
log=log,
num_workers=num_workers,
)
Expand Down Expand Up @@ -197,6 +200,7 @@ def SCALEX(
processed=processed,
batch_name=batch_name,
batch_key=batch_key,
# keep_mt=keep_mt,
log = log,
num_workers=num_workers,
)
Expand Down Expand Up @@ -230,7 +234,8 @@ def SCALEX(

# UMAP visualization
sc.set_figure_params(dpi=80, figsize=(3,3))
cols = ['batch', 'celltype', 'cell_type', 'leiden']
cols = [cell_type, 'leiden']
cols += ['batch'] if n_domain > 1 else []
color = [c for c in cols if c in adata.obs]
if outdir:
sc.settings.figdir = outdir
Expand All @@ -243,17 +248,21 @@ def SCALEX(
embedding(adata, color='leiden', groupby='projection', save=save, show=show)
else:
sc.pl.umap(adata, color=color, save=save, wspace=0.4, ncols=4, show=show)
if assess:
if len(adata.obs['batch'].cat.categories) > 1:
entropy_score = batch_entropy_mixing_score(adata.obsm['X_umap'], adata.obs['batch'])
log.info('batch_entropy_mixing_score: {:.3f}'.format(entropy_score))

if 'celltype' in adata.obs:
sil_score = silhouette_score(adata.obsm['X_umap'], adata.obs['celltype'].cat.codes)
log.info("silhouette_score: {:.3f}".format(sil_score))

if outdir is not None:
adata.write(os.path.join(outdir, 'adata.h5ad'), compression='gzip')

if assess:
if adata.shape[0] > 5e4:
log.info('The number of cells is too large to calculate entropy_batch_mixing_score and silhouette_score')
sc.pp.subsample(adata, n_obs=int(5e4))
if len(adata.obs['batch'].cat.categories) > 1:
entropy_score = batch_entropy_mixing_score(adata.obsm['X_umap'], adata.obs['batch'])
log.info('batch_entropy_mixing_score: {:.3f}'.format(entropy_score))

if cell_type in adata.obs:
sil_score = silhouette_score(adata.obsm['X_umap'], adata.obs[cell_type].cat.codes)
log.info("silhouette_score: {:.3f}".format(sil_score))

return adata

Expand Down

0 comments on commit 7cef922

Please sign in to comment.