Skip to content

Commit

Permalink
Merge pull request #91 from danilexn/merging_issue
Browse files Browse the repository at this point in the history
  • Loading branch information
danilexn authored Apr 18, 2024
2 parents f3c56c2 + ac5709b commit 53fccd3
Show file tree
Hide file tree
Showing 12 changed files with 181 additions and 33 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
copyright = '2021-2023, Rajewsky lab'
author = 'Tamas Ryszard Sztanka-Toth, Marvin Jens, Nikos Karaiskos, Nikolaus Rajewsky'

version = '0.7.5'
version = '0.7.6'
release = version

# -- General configuration
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = spacemake
version = 0.7.5
version = 0.7.6
author = Tamas Ryszard Sztanka-Toth, Marvin Jens, Nikos Karaiskos, Nikolaus Rajewsky
author_email = TamasRyszard.Sztanka-Toth@mdc-berlin.de
description = A bioinformatic pipeline for the analysis of spatial transcriptomic data
Expand Down
21 changes: 18 additions & 3 deletions spacemake/preprocess/dge.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
import logging

logger_name = "spacemake.preprocess.dge"
logger = logging.getLogger(logger_name)

def calculate_adata_metrics(adata, dge_summary_path=None, n_reads=None):
import scanpy as sc
import pandas as pd
Expand Down Expand Up @@ -151,7 +156,7 @@ def dge_to_sparse_adata(dge_path, dge_summary_path):
"need to add mt-missing because no mitochondrial stuff was among the genes for annotation"
)
gene_names.append("mt-missing")
X = vstack([X, np.zeros(X.shape[1])]).tocsr()
X = vstack([X, np.zeros((1, X.get_shape()[1]))]).tocsr()

# create anndata object, but we get the transpose of X, so matrix will
# be in CSC format
Expand All @@ -169,6 +174,9 @@ def dge_to_sparse_adata(dge_path, dge_summary_path):
# calculate per shannon_entropy and string_compression per bead
calculate_shannon_entropy_scompression(adata)

if adata.X.sum() == 0:
logger.warn(f"The DGE from {dge_path} is empty")

return adata


Expand Down Expand Up @@ -252,10 +260,17 @@ def attach_puck_variables(adata, puck_variables):
adata.uns["puck_variables"] = puck_variables

x_pos_max, y_pos_max = tuple(adata.obsm["spatial"].max(axis=0))
x_pos_min, y_pos_min = tuple(adata.obsm["spatial"].min(axis=0))

width_um = adata.uns["puck_variables"]["width_um"]
coord_by_um = x_pos_max / width_um
height_um = int(y_pos_max / coord_by_um)
coord_by_um = (x_pos_max - x_pos_min) / width_um

# this can be NaN if only one coordinate (only one cell, will fail)
if coord_by_um > 0:
height_um = int((y_pos_max - y_pos_min) / coord_by_um)
else:
height_um = 1 # avoid division by zero and error in reports
coord_by_um = 1

adata.uns["puck_variables"]["height_um"] = height_um
adata.uns["puck_variables"]["coord_by_um"] = coord_by_um
Expand Down
19 changes: 19 additions & 0 deletions spacemake/project_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,9 @@ def assert_valid(self):

# check that puck_barcode_file(s), R1 and R2 files exist
for n_col in ['puck_barcode_file', 'R1', 'R2']:
if n_col in ['R1', 'R2'] and row['is_merged']:
continue

if type(row[n_col]) is list:
for _it_row in row[n_col]:
if not os.path.exists(_it_row):
Expand All @@ -742,6 +745,22 @@ def assert_valid(self):
_valid_puck_coordinate = False
elif type(row['puck_barcode_file']) is str and row['puck_barcode_file'] == '':
_valid_puck_coordinate = False

# check that merging variables are properly specified
# otherwise, this throws 'MissingInputException' because it cannot find the bam files
if row['is_merged']:
if len(row['merged_from']) < 2:
raise SystemExit(SpacemakeError(f"At {index}, there is <2 samples under 'merged_from'"))
for merged_i in row['merged_from']:
if (not isinstance(merged_i, tuple) and not isinstance(merged_i, list)) or len(merged_i) != 2:
raise SystemExit(SpacemakeError(f"At {index}, wrong format for 'merged_from'.\n"+
"It must be something like "+
"[('project', 'sample_a'), ('project', 'sample_b')]"))
try:
self.assert_sample(merged_i[0], merged_i[1])
except ProjectSampleNotFoundError as e:
self.logger.error(f"Merging error at {index}")
raise e

if not _valid_puck_coordinate:
_puck_vars = self.get_puck_variables(project_id = index[0], sample_id = index[1])
Expand Down
48 changes: 43 additions & 5 deletions spacemake/smk.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,34 @@ def populate_variables_from_args(pdf, args, arg_prefix=''):

return ret

def consolidate_pucks_merged_samples():
    """Make each merged sample's puck list the union of its parents' pucks.

    For every row of the project dataframe that is a merged, spatial sample,
    collect the (puck_barcode_file_id, puck_barcode_file) pairs of all the
    samples listed in 'merged_from' and write the de-duplicated union back
    into the merged sample's row. Uses the module-level ``pdf`` project df.
    """
    for index, row in pdf.df.iterrows():
        project_id, sample_id = index
        puck_ids = row['puck_barcode_file_id']

        # Collapse the id list to a single representative id before the
        # spatial check, mirroring update_project_df_barcode_matches.
        # NOTE(review): the original collapsed AFTER the is_spatial call (so
        # the collapsed value was unused) and used an unreachable
        # `elif len(puck_ids):` fallback (len >= 1 already covers every
        # non-empty list) — confirm is_spatial expects a single id here.
        if len(puck_ids) >= 1:
            puck_ids = puck_ids[0]
        else:
            puck_ids = pdf.project_df_default_values['puck_barcode_file_id']

        # only merged, spatial samples need their puck lists consolidated
        if (not row['is_merged']) or (not pdf.is_spatial(project_id, sample_id, puck_ids)):
            continue

        # Union of (id, file) pairs over all parent samples; the set removes
        # pucks shared between parents.
        puck_id_file = set()
        for sample_tuple in row['merged_from']:
            parent_row = pdf.df.loc[sample_tuple]
            puck_id_file.update(
                zip(parent_row['puck_barcode_file_id'],
                    parent_row['puck_barcode_file'])
            )

        # sorted() makes the resulting column order deterministic across runs
        pids, pbfs = zip(*sorted(puck_id_file))

        # .at (not .loc) stores each list as a single cell value — consistent
        # with the same fix applied in update_project_df_barcode_matches.
        pdf.df.at[index, 'puck_barcode_file_id'] = list(pids)
        pdf.df.at[index, 'puck_barcode_file'] = list(pbfs)


def update_project_df_barcode_matches(prealigned=False):
from spacemake.snakemake.variables import puck_count_barcode_matches_summary, puck_count_prealigned_barcode_matches_summary

Expand All @@ -441,9 +469,18 @@ def update_project_df_barcode_matches(prealigned=False):
else:
_bc_file = puck_count_barcode_matches_summary

for index, _ in pdf.df.iterrows():
for index, row in pdf.df.iterrows():
project_id, sample_id = index

puck_ids = row['puck_barcode_file_id']
if len(puck_ids) >= 1:
puck_ids = puck_ids[0]
elif len(puck_ids):
puck_ids = pdf.project_df_default_values['puck_barcode_file_id']

if (row['is_merged'] and prealigned) or (not pdf.is_spatial(project_id, sample_id, puck_ids)):
continue

# check if barcodes have been filtered
_f_barcodes_df = _bc_file.format(project_id=project_id,
sample_id=sample_id)
Expand All @@ -457,11 +494,11 @@ def update_project_df_barcode_matches(prealigned=False):

above_threshold_mask = barcodes_df['pass_threshold'] == 1

_puck_barcode_files = barcodes_df[above_threshold_mask]['puck_barcode_file'].values.tolist().__str__()
_puck_barcode_files_id = barcodes_df[above_threshold_mask]['puck_barcode_file_id'].values.tolist().__str__()
_puck_barcode_files = barcodes_df[above_threshold_mask]['puck_barcode_file'].values.tolist()
_puck_barcode_files_id = barcodes_df[above_threshold_mask]['puck_barcode_file_id'].values.tolist()

pdf.df.loc[index, 'puck_barcode_file'] = _puck_barcode_files
pdf.df.loc[index, 'puck_barcode_file_id'] = _puck_barcode_files_id
pdf.df.at[index, 'puck_barcode_file'] = _puck_barcode_files
pdf.df.at[index, 'puck_barcode_file_id'] = _puck_barcode_files_id

@message_aggregation(logger_name)
def spacemake_run(pdf, args):
Expand Down Expand Up @@ -533,6 +570,7 @@ def spacemake_run(pdf, args):
# update valid pucks (above threshold) before continuing to downstream
# this performs counting of matching barcodes after alignment
update_project_df_barcode_matches(prealigned=True)
consolidate_pucks_merged_samples()
pdf.dump()

# run snakemake after prealignment filter
Expand Down
55 changes: 52 additions & 3 deletions spacemake/snakemake/main.smk
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,9 @@ rule get_stats_prealigned_barcodes:
data_root_type='complete_data',
downsampling_percentage='',
run_on_external=False,
filter_merged=False
),
get_prealignment_files(puck_count_prealigned_barcode_matches_summary)
get_prealignment_files(puck_count_prealigned_barcode_matches_summary, filter_merged=True)

rule get_whitelist_barcodes:
input:
Expand Down Expand Up @@ -541,7 +542,51 @@ rule create_mesh_spatial_dge:

rule puck_collection_stitching:
input:
unpack(get_puck_collection_stitching_input),
unpack(lambda wc: get_puck_collection_stitching_input(wc, to_mesh=False)),
# the puck_barcode_files_summary is required for puck_metadata
puck_barcode_files_summary
output:
dge_spatial_collection,
dge_spatial_collection_obs
params:
puck_data = lambda wildcards: project_df.get_puck_variables(
project_id = wildcards.project_id,
sample_id = wildcards.sample_id),
puck_metadata = lambda wildcards: project_df.get_puck_barcode_ids_and_files(
project_id=wildcards.project_id, sample_id=wildcards.sample_id
),
run:
_pc = puck_collection.merge_pucks_to_collection(
# takes all input except the puck_barcode_files
input[:-1],
params['puck_metadata'][0],
params['puck_data']['coordinate_system'],
"",
"puck_id",
)

# x_pos and y_pos to be global coordinates
_pc.obs['x_pos'] = _pc.obsm['spatial'][..., 0]
_pc.obs['y_pos'] = _pc.obsm['spatial'][..., 1]

_pc.write_h5ad(output[0])

# add 'cell_bc' name to index for same format as individual pucks
# this also ensures compatibility with qc_sequencing_create_sheet.Rmd
df = _pc.obs
df.index = np.arange(len(df))
df.index.name = "cell_bc"

# only get numeric columns, to avoid problems during summarisation
# we could implement sth like df.A.str.extract('(\d+)')
# to avoid losing information from columns that are not numeric
df._get_numeric_data().to_csv(output[1])


# TODO: collapse this with previous rule so we have a single point where we create the dge_spatial_collection
rule puck_collection_stitching_meshed:
input:
unpack(lambda wc: get_puck_collection_stitching_input(wc, to_mesh=True)),
# the puck_barcode_files_summary is required for puck_metadata
puck_barcode_files_summary
output:
Expand Down Expand Up @@ -569,10 +614,13 @@ rule puck_collection_stitching:
_pc.obs['y_pos'] = _pc.obsm['spatial'][..., 1]

_pc.write_h5ad(output[0])

# add 'cell_bc' name to index for same format as individual pucks
# this also ensures compatibility with qc_sequencing_create_sheet.Rmd
df = _pc.obs
df.index = np.arange(len(df))
df.index.name = "cell_bc"

# only get numeric columns, to avoid problems during summarisation
# we could implement sth like df.A.str.extract('(\d+)')
# to avoid losing information from columns that are not numeric
Expand Down Expand Up @@ -742,7 +790,8 @@ rule count_barcode_matches:
'matching_ratio': [matching_ratio],
})], ignore_index=True, sort=False)

above_threshold_mask = out_df.matching_ratio >= params['run_mode_variables']['spatial_barcode_min_matches']
# we use > so whenever default: 0 we exclude empty pucks
above_threshold_mask = out_df.matching_ratio > params['run_mode_variables']['spatial_barcode_min_matches']
out_df['pass_threshold'] = 0
out_df['pass_threshold'][above_threshold_mask] = 1

Expand Down
52 changes: 32 additions & 20 deletions spacemake/snakemake/scripts/snakemake_helper_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,22 +62,25 @@ def get_output_files(
project_id = project_id,
sample_id = sample_id
)

if "coordinate_system" not in puck_vars.keys():

if (len(puck_barcode_file_ids) == 0) \
or ("coordinate_system" not in puck_vars.keys()):
continue

coordinate_system = puck_vars["coordinate_system"]

if coordinate_system == '':
continue

# add the non spatial barcode by default
non_spatial_pbf_id = project_df.project_df_default_values[
"puck_barcode_file_id"
][0]
puck_barcode_file_ids = "puck_collection"

else:
# add the non spatial barcode by default
non_spatial_pbf_id = project_df.project_df_default_values[
"puck_barcode_file_id"
][0]

if non_spatial_pbf_id not in puck_barcode_file_ids:
puck_barcode_file_ids.append(non_spatial_pbf_id)
if non_spatial_pbf_id not in puck_barcode_file_ids:
puck_barcode_file_ids.append(non_spatial_pbf_id)

for run_mode in row["run_mode"]:
run_mode_variables = project_df.config.get_run_mode(run_mode).variables
Expand All @@ -88,10 +91,7 @@ def get_output_files(
polyA_adapter_trimmed = ".polyA_adapter_trimmed"
else:
polyA_adapter_trimmed = ""

if check_puck_collection:
puck_barcode_file_ids = "puck_collection"


out_files = out_files + expand(
pattern,
project_id=project_id,
Expand Down Expand Up @@ -144,9 +144,12 @@ def get_output_files_qc(
return _out_files


def get_prealignment_files(pattern):
def get_prealignment_files(pattern, filter_merged=False):
df = project_df.df

if filter_merged:
df = df.loc[~df.is_merged]

prealignment_files = []

for index, _ in df.iterrows():
Expand Down Expand Up @@ -216,17 +219,21 @@ def get_all_dges_collection(wildcards):
sample_id = sample_id
)

if "coordinate_system" not in puck_vars.keys():
puck_barcode_file_ids = project_df.get_puck_barcode_ids_and_files(
project_id, sample_id
)[0]

if (len(puck_barcode_file_ids) == 0) \
or ("coordinate_system" not in puck_vars.keys()):
continue

coordinate_system = puck_vars["coordinate_system"]

if coordinate_system == '':
continue

if not os.path.exists(coordinate_system):
raise FileNotFoundError(f"at project {project_id} sample {sample_id} "+
f"'coordinate_system' file {coordinate_system} could not be found")
f"'coordinate_system' file {coordinate_system} cannot be found")

# will consider puck_collection within all dges if a coordinate system is specified in the puck
with_puck_collection = False if not coordinate_system else True
Expand Down Expand Up @@ -612,6 +619,7 @@ def get_dge_from_run_mode(
downsampling_percentage,
puck_barcode_file_id,
only_spatial=False,
to_mesh=None
):
has_dge = project_df.has_dge(project_id=project_id, sample_id=sample_id)

Expand Down Expand Up @@ -678,9 +686,12 @@ def get_dge_from_run_mode(
if not is_spatial and not only_spatial:
dge_out_pattern = dge_out_h5ad
dge_out_summary_pattern = dge_out_h5ad_obs
elif run_mode_variables["mesh_data"]:
elif run_mode_variables["mesh_data"] or to_mesh:
dge_out_pattern = dge_spatial_mesh
dge_out_summary_pattern = dge_spatial_mesh_obs
elif to_mesh is not None and to_mesh == False:
dge_out_pattern = dge_spatial
dge_out_summary_pattern = dge_spatial_obs
else:
dge_out_pattern = dge_spatial
dge_out_summary_pattern = dge_spatial_obs
Expand Down Expand Up @@ -827,7 +838,7 @@ def get_all_barcode_readcounts(wildcards, prealigned=False):
"polyA_adapter_trimmed": polyA_adapter_trimmed_wildcard,
}

if prealigned:
if prealigned or is_merged:
return {"bc_readcounts": expand(barcode_readcounts, **extra_args)}
else:
return {"bc_readcounts": expand(barcode_readcounts_prealigned, **extra_args)}
Expand Down Expand Up @@ -929,7 +940,7 @@ def get_puck_file(wildcards):
return {"barcode_file": puck_barcode_file}


def get_puck_collection_stitching_input(wildcards):
def get_puck_collection_stitching_input(wildcards, to_mesh=False):
# there are three options:
# 1) no spatial dge
# 2) spatial dge, no mesh
Expand All @@ -948,6 +959,7 @@ def get_puck_collection_stitching_input(wildcards):
downsampling_percentage=wildcards.downsampling_percentage,
puck_barcode_file_id=puck_barcode_file_ids,
only_spatial=True,
to_mesh=to_mesh
)["dge"]][0]


Expand Down
Binary file modified test_data/test_reads.R1.fastq.gz
Binary file not shown.
Binary file modified test_data/test_reads.R2.fastq.gz
Binary file not shown.
Loading

0 comments on commit 53fccd3

Please sign in to comment.