Skip to content

Commit

Permalink
add snp matrix to snakemake
Browse files Browse the repository at this point in the history
  • Loading branch information
ktmeaton committed Mar 16, 2021
1 parent b1d03b4 commit defe53c
Show file tree
Hide file tree
Showing 16 changed files with 4,238 additions and 27 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
snp-dists 0.7.0 GCA_009669545.1_ASM966954v1_genomic GCA_009669555.1_ASM966955v1_genomic GCA_009909635.1_ASM990963v1_genomic Reference SAMN02442718 SAMN02442721 testlocal1 testlocal2
GCA_009669545.1_ASM966954v1_genomic 0 1 222 225 14 11 14 11
GCA_009669555.1_ASM966955v1_genomic 1 0 223 226 14 11 14 11
GCA_009909635.1_ASM990963v1_genomic 222 223 0 205 16 11 16 11
Reference 225 226 205 0 15 11 15 11
SAMN02442718 14 14 16 15 0 3 0 3
SAMN02442721 11 11 11 11 3 0 3 0
testlocal1 14 14 16 15 0 3 0 3
testlocal2 11 11 11 11 3 0 3 0

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
snp-dists 0.7.0 GCA_009669545.1_ASM966954v1_genomic GCA_009669555.1_ASM966955v1_genomic GCA_009909635.1_ASM990963v1_genomic Reference SAMN02442718 SAMN02442721 testlocal1 testlocal2
GCA_009669545.1_ASM966954v1_genomic 0 0 0 0 3 4 3 4
GCA_009669555.1_ASM966955v1_genomic 0 0 0 0 3 4 3 4
GCA_009909635.1_ASM990963v1_genomic 0 0 0 0 3 4 3 4
Reference 0 0 0 0 3 4 3 4
SAMN02442718 3 3 3 3 0 3 0 3
SAMN02442721 4 4 4 4 3 0 3 0
testlocal1 3 3 3 3 0 3 0 3
testlocal2 4 4 4 4 3 0 3 0

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
snp-dists 0.7.0 GCA_009669545.1_ASM966954v1_genomic GCA_009669555.1_ASM966955v1_genomic GCA_009909635.1_ASM990963v1_genomic Reference SAMN02442718 SAMN02442721 testlocal1 testlocal2
GCA_009669545.1_ASM966954v1_genomic 0 1 222 225 14 11 14 11
GCA_009669555.1_ASM966955v1_genomic 1 0 223 226 14 11 14 11
GCA_009909635.1_ASM990963v1_genomic 222 223 0 205 16 11 16 11
Reference 225 226 205 0 15 11 15 11
SAMN02442718 14 14 16 15 0 3 0 3
SAMN02442721 11 11 11 11 3 0 3 0
testlocal1 14 14 16 15 0 3 0 3
testlocal2 11 11 11 11 3 0 3 0

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
snp-dists 0.7.0 GCA_009669545.1_ASM966954v1_genomic GCA_009669555.1_ASM966955v1_genomic GCA_009909635.1_ASM990963v1_genomic Reference SAMN02442718 SAMN02442721 testlocal1 testlocal2
GCA_009669545.1_ASM966954v1_genomic 0 0 2 1 14 11 14 11
GCA_009669555.1_ASM966955v1_genomic 0 0 2 1 14 11 14 11
GCA_009909635.1_ASM990963v1_genomic 2 2 0 3 16 11 16 11
Reference 1 1 3 0 15 11 15 11
SAMN02442718 14 14 16 15 0 3 0 3
SAMN02442721 11 11 11 11 3 0 3 0
testlocal1 14 14 16 15 0 3 0 3
testlocal2 11 11 11 11 3 0 3 0

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
snp-dists 0.7.0 GCA_009669545.1_ASM966954v1_genomic GCA_009669555.1_ASM966955v1_genomic GCA_009909635.1_ASM990963v1_genomic Reference SAMN02442718 SAMN02442721 testlocal1 testlocal2
GCA_009669545.1_ASM966954v1_genomic 0 1 222 216 14 11 14 11
GCA_009669555.1_ASM966955v1_genomic 1 0 223 217 14 11 14 11
GCA_009909635.1_ASM990963v1_genomic 222 223 0 204 16 11 16 11
Reference 216 217 204 0 15 11 15 11
SAMN02442718 14 14 16 15 0 3 0 3
SAMN02442721 11 11 11 11 3 0 3 0
testlocal1 14 14 16 15 0 3 0 3
testlocal2 11 11 11 11 3 0 3 0

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
snp-dists 0.7.0 GCA_009669545.1_ASM966954v1_genomic GCA_009669555.1_ASM966955v1_genomic GCA_009909635.1_ASM990963v1_genomic Reference SAMN02442718 SAMN02442721 testlocal1 testlocal2
GCA_009669545.1_ASM966954v1_genomic 0 1 222 225 14 11 14 11
GCA_009669555.1_ASM966955v1_genomic 1 0 223 226 14 11 14 11
GCA_009909635.1_ASM990963v1_genomic 222 223 0 205 16 11 16 11
Reference 225 226 205 0 15 11 15 11
SAMN02442718 14 14 16 15 0 3 0 3
SAMN02442721 11 11 11 11 3 0 3 0
testlocal1 14 14 16 15 0 3 0 3
testlocal2 11 11 11 11 3 0 3 0

Large diffs are not rendered by default.

17 changes: 9 additions & 8 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -99,19 +99,20 @@ rule all:
"""
input:
# Metadata
metadata_all_input,
metadata_all,
# Multiqc
multiqc_all_input,
multiqc_all,
# Phylo
iqtree_scf_all_input,
iqtree_scf_all,
# Post-Phylo
clock_plot_all_input,
mugration_plot_all_input,
geo_all_input,
clock_plot_all,
mugration_plot_all,
geo_all,
# Plot
plot_missing_data_all_input,
plot_missing_data_all,
plot_snp_matrix_all,
# Stats
locus_coverage_collect_all_input,
locus_coverage_collect_all

# -----------------------------------------------------------------------------#
# Help and Usage #
Expand Down
12 changes: 12 additions & 0 deletions workflow/rules/plot.smk
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,15 @@ rule plot_missing_data:
notebook= os.path.join(logs_dir, "notebooks", "{reads_origin}", "processed_{locus_name}_plot_missing_data.py.ipynb")
notebook:
os.path.join(notebooks_dir, "plot_missing_data.py.ipynb")

# Build a pairwise SNP distance matrix from a snippy-core alignment and render
# it as an HTML heatmap.
# Wildcards {reads_origin}, {locus_name} and {filter} are all taken from the
# requested output file name.
rule plot_snp_matrix:
input:
# Multiple-sequence alignment that the distances are computed from.
aln = results_dir + "/snippy_multi/{reads_origin}/snippy-core_{locus_name}.{filter}.aln",
output:
# Distance matrix written by snp-dists (stdout redirected to this file).
dist = results_dir + "/snippy_multi/{reads_origin}/snippy-core_{locus_name}.{filter}.dist",
# Heatmap produced by plot_distance_matrix.py from the matrix above.
html = results_dir + "/snippy_multi/{reads_origin}/snippy-core_{locus_name}.{filter}.dist.heatmap.html",
# NOTE(review): {scripts_dir} is not defined in this rule; presumably a
# workflow-level variable — confirm it is set before this file is included.
shell:
"""
snp-dists {input.aln} > {output.dist}
python3 {scripts_dir}/plot_distance_matrix.py -i {output.dist} -o {output.html}
"""
38 changes: 38 additions & 0 deletions workflow/rules/targets.smk
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,44 @@ rule plot_missing_data_all:
input:
plot_missing_data_all_input

#------------------------------------------------------------------------------#

# Target lists for the SNP distance-matrix heatmaps, one pair (filtered / full
# alignment) per reads origin.
#
# The reads-origin directory is filled in through an explicit {reads_origin}
# placeholder. Deriving the sra/local/assembly lists with
# str.replace("all", ...) would rewrite *every* "all" substring in the path
# (for example a results directory called "small_run" or a locus name that
# contains "all"), not just the reads-origin directory.
_plot_snp_matrix_filter_template = (
    results_dir
    + "/snippy_multi/{reads_origin}/snippy-core_{locus_name}.snps.filter{missing_data}.dist.heatmap.html"
)
_plot_snp_matrix_full_template = (
    results_dir
    + "/snippy_multi/{reads_origin}/snippy-core_{locus_name}.full.dist.heatmap.html"
)


def _plot_snp_matrix_targets(reads_origin):
    """Return (filtered_targets, full_targets) heatmap paths for one reads origin.

    reads_origin is the directory name under snippy_multi/ ("all", "sra",
    "local" or "assembly"). Locus names and missing-data thresholds are read
    from the workflow config.
    """
    filter_targets = expand(
        _plot_snp_matrix_filter_template,
        reads_origin=reads_origin,
        locus_name=config["reference_locus_name"],
        missing_data=config["snippy_multi_plot_missing_data"],
    )
    full_targets = expand(
        _plot_snp_matrix_full_template,
        reads_origin=reads_origin,
        locus_name=config["reference_locus_name"],
    )
    return filter_targets, full_targets


plot_snp_matrix_filter_all_input, plot_snp_matrix_full_all_input = _plot_snp_matrix_targets("all")
plot_snp_matrix_filter_sra_input, plot_snp_matrix_full_sra_input = _plot_snp_matrix_targets("sra")
plot_snp_matrix_filter_local_input, plot_snp_matrix_full_local_input = _plot_snp_matrix_targets("local")
plot_snp_matrix_filter_assembly_input, plot_snp_matrix_full_assembly_input = _plot_snp_matrix_targets("assembly")

# Aggregate target rules: each one has only inputs, so requesting it asks for
# the filtered and full SNP-matrix heatmaps of one reads origin.

# Heatmaps for the "assembly" reads origin.
rule plot_snp_matrix_assembly:
input:
plot_snp_matrix_filter_assembly_input,
plot_snp_matrix_full_assembly_input,

# Heatmaps for the "local" reads origin.
rule plot_snp_matrix_local:
input:
plot_snp_matrix_filter_local_input,
plot_snp_matrix_full_local_input,

# Heatmaps for the "sra" reads origin.
rule plot_snp_matrix_sra:
input:
plot_snp_matrix_filter_sra_input,
plot_snp_matrix_full_sra_input,

# Heatmaps for the "all" reads origin.
rule plot_snp_matrix_all:
input:
plot_snp_matrix_filter_all_input,
plot_snp_matrix_full_all_input,


#------------------------------------------------------------------------------#
# Metadata
#------------------------------------------------------------------------------#
Expand Down
46 changes: 27 additions & 19 deletions workflow/scripts/plot_distance_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,29 +24,35 @@ def load_matrix(fpath: PathLike, delim: str, filter_df) -> pd.DataFrame:
matrix = []
with open(fpath) as instream:
header = next(instream).rstrip()
names = header.split(delim)[1:]
filter_names = [name for name in names if name in filter_df.index]
pretty_names = [
filter_df["Strain"][name]
+ "_"
+ str(filter_df["Date"][name]).lstrip("[").rstrip("]")
+ "_"
+ filter_df["Country"]
for name in filter_names
]

filter_i = [i for i in range(0, len(names)) if names[i] in filter_df.index]
ind_names = header.split(delim)[1:]
if filter_df:
ind_names = [name for name in ind_names if name in filter_df.index]
ind_names = [
filter_df["Strain"][name]
+ "_"
+ str(filter_df["Date"][name]).lstrip("[").rstrip("]")
+ "_"
+ filter_df["Country"]
for name in ind_names
]
filter_i = [
i for i in range(0, len(ind_names)) if ind_names[i] in filter_df.index
]

col_names = ind_names

for row in map(str.rstrip, instream):
# Filter sample rows
sample = row.split(delim)[0]
if sample not in filter_names:
if sample not in ind_names:
continue
# Filter distance column
dists = row.split(delim)[1:]
dists = [dists[i] for i in filter_i]
if filter_df:
dists = [dists[i] for i in filter_i]

matrix.append([int(d) for d in dists])
return pd.DataFrame(matrix, index=pretty_names, columns=pretty_names)
return pd.DataFrame(matrix, index=ind_names, columns=col_names)


@click.command()
Expand Down Expand Up @@ -117,10 +123,12 @@ def main(
"""This script generates an interactive heatmap (HTML) for a distance matrix."""

# Load metadata
metadata_df = pd.read_csv(metadata, sep="\t")
metadata_df.fillna("NA", inplace=True)
metadata_df.set_index("Sample", inplace=True)
filter_df = metadata_df[metadata_df[attribute] == state]
filter_df = None
if metadata:
metadata_df = pd.read_csv(metadata, sep="\t")
metadata_df.fillna("NA", inplace=True)
metadata_df.set_index("Sample", inplace=True)
filter_df = metadata_df[metadata_df[attribute] == state]

# Load distance matrix
matrix_df = load_matrix(matrix, delim, filter_df)
Expand Down

0 comments on commit defe53c

Please sign in to comment.