Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CheckM+PhyloPhlan integration #2

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ import tempfile
configfile: "config.yaml"

ENV = config["env"]

shell.prefix("set +u; " + ENV + "; set -u; ")

TMP_DIR_ROOT = config['tmp_dir_root']

samples = config["samples"]
sample_names=config['binning_samples']

snakefiles = os.path.join(config["software"]["snakemake_folder"],
"bin/snakefiles/")
Expand All @@ -26,6 +26,11 @@ include: snakefiles + "anvio"
include: snakefiles + "clean"
include: snakefiles + "test"
include: snakefiles + "util"
include: snakefiles + "gene_search.snake"
include: snakefiles + "phylo.snake"
include: snakefiles + "checkm.snake"

localrules: phylophlan_prep, phylophlan_post, phylophlan_prep_combined, phylophlan_post_combined

rule all:
# raw
Expand Down Expand Up @@ -57,4 +62,12 @@ rule all:
expand(anvio_dir + "{bin_sample}/{bin_sample}_samples-summary_CONCOCT.tar.gz",
bin_sample=config['binning_samples']),
expand(anvio_dir + "{bin_sample}/{bin_sample}.db.anvi_add_maxbin.done",
bin_sample=config['binning_samples'])
bin_sample=config['binning_samples']),
# CheckM
#expand(bin_dir + "{bin_sample}/checkm/checkm.done",
expand(bin_dir + "{bin_sample}/checkm/qa.tsv",
bin_sample=config['binning_samples']),
# PhyloPhlan
### Uncomment to enable phylophlan run for the combined set of bins
#bin_dir + "combined/phylo/comb_phylo.done",
expand(bin_dir + "{bin_sample}/phylo/phylo.done", bin_sample = sample_names)
17 changes: 0 additions & 17 deletions bin/install/requirements.txt

This file was deleted.

9 changes: 7 additions & 2 deletions bin/install/requirements.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@
# $ conda env create -f <this file>
name: snakemake_assemble
channels:
- bioconda
- conda-forge
- defaults
- biocore
- r
- anaconda
- bioconda
dependencies:
- graphviz=2.38.0
- python=3
Expand All @@ -21,4 +23,7 @@ dependencies:
- megahit
- spades
- maxbin2
- bbmap
- bbmap
- prodigal
- biopython
- pandas
9 changes: 9 additions & 0 deletions bin/install/requirements_phylophlan.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# This file may be used to create an environment using:
# $ conda env create -f <this file>
name: phylophlan
dependencies:
- python=2
- muscle=3.8.31
- fasttree=2.1.9
- biopython
- scipy
22 changes: 22 additions & 0 deletions bin/scripts/contig_length_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env python
import sys
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

if len(sys.argv) < 4:
print("Usage: %s <length threshold> <contigs_file> <output> [suffix]" % sys.argv[0])
print("Will filter records shorter than <length threshold> and optionally add [suffix] to the names")
sys.exit(1)

f_n = sys.argv[2]
suffix = ""
if len(sys.argv) >= 5:
suffix = "_" + sys.argv[4]

input_seq_iterator = SeqIO.parse(open(f_n, "r"), "fasta")

output_handle = open(sys.argv[3], "w")
#(record.name + suffix).replace(".", "_")
SeqIO.write((SeqRecord(record.seq, record.name + suffix, "","") for record in input_seq_iterator \
if len(record.seq) >= int(sys.argv[1])), output_handle, "fasta")
output_handle.close()
5 changes: 3 additions & 2 deletions bin/snakefiles/bin
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ rule bin_run_maxbin:
abund = bin_dir + "{bin_sample}/abundance_files/{bin_sample}_abund_list.txt",
ref = assemble_dir + "{bin_sample}/%s/{bin_sample}.contigs.simple.fa" % config['mapping_assembler']
output:
bin_dir + "{bin_sample}/maxbin/{bin_sample}.summary"
#bin_dir + "{bin_sample}/maxbin/{bin_sample}.summary"
bin_dir + "{bin_sample}/maxbin/{bin_sample}.marker"
log:
bin_dir + "logs/bin_maxbin_{bin_sample}.log"
benchmark:
Expand Down Expand Up @@ -121,4 +122,4 @@ rule bin:
expand(bin_dir + "{bin_sample}/maxbin/{bin_sample}.summary",
bin_sample=config['binning_samples']),
expand(bin_dir + "{bin_sample}/abundance_files.tar.gz",
bin_sample = config['binning_samples'])
bin_sample = config['binning_samples'])
33 changes: 33 additions & 0 deletions bin/snakefiles/checkm.snake
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@

#---- CheckM stats -------------------------------------------------------------
CHECKM_ENV = config['checkm_env']

rule checkm:
input: "{path}/{sample}/prodigal/prodigal.done"
output: "{path}/{sample}/checkm/qa.tsv"
params: in_dir="{path}/{sample}/prodigal", out_dir="{path}/{sample}/checkm"
log: "{path}/{sample}/checkm.log"
threads:
8
#TODO set up --tmpdir
shell: "set +u; {CHECKM_ENV}; set -u \n"
"rm -rf {params.out_dir}/* \n"
#"echo 'Launching checkm lineage_wf' > {log} \n"
"checkm lineage_wf -t {threads} --pplacer_threads {threads} --genes -x faa {params.in_dir} {params.out_dir} &> {log} \n"
"checkm qa -t {threads} -o 1 --tab_table -f {output} {params.out_dir}/lineage.ms {params.out_dir} &>> {log} \n"
"source deactivate"

rule parse_checkm:
input: qa="{path}/qa.tsv", tree_qa="{path}/tree_qa.tsv"
output: "{path}/checkm.tsv"
run:
import pandas as pd
table = pd.read_table(input.qa, dtype="str")
tree_table = pd.read_table(input.tree_qa, dtype="str", na_filter=False)
all_table = pd.merge(table, tree_table, on="Bin Id")
res_table = all_table[["Bin Id", "Taxonomy (contained)", "Taxonomy (sister lineage)", "Genome size (Mbp)", "Completeness", "Contamination"]].copy()
def extract_taxon(taxonomy):
return str(taxonomy).split(";")[-1]
for column in ["Taxonomy (contained)", "Taxonomy (sister lineage)"]:
res_table[column] = res_table[column].apply(extract_taxon)
res_table.to_csv(output[0], index=False, sep="\t")
42 changes: 42 additions & 0 deletions bin/snakefiles/gene_search.snake
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
def bin_names(folder):
b=[os.path.basename(f)[:-len(".fasta")] for f in glob.glob(folder + "/*.fasta")]
print("Bins in folder ", folder , " ", b)
return b

binner="maxbin"
rule genes_sample:
input:
bin_dir + "{sample}/maxbin/{sample}.summary",
lambda wc: expand(bin_dir + wc.sample + "/prodigal/{b}.faa", b=bin_names(bin_dir + wc.sample + "/" + binner))
output:
touch(bin_dir + "{sample}/prodigal/prodigal.done")

rule length_filter:
input:
"{path}/maxbin/{assembly}.fasta"
output:
"{path}/prodigal/{assembly}_{len}K.fasta"
resources:
disc = 1
log:
"{path}/prodigal/{assembly}_{len}K_filter.log"
shell:
"""contig_length_filter.py {wildcards.len}000 {input} {output} {wildcards.assembly} &> {log}"""

rule prodigal:
input:
"{path}/{b}_%dK.fasta" % config["gene_search"]["min_length"]
output:
#nuc = "{assembly}.fna",
prot = "{path}/{b}.faa",
gff = "{path}/{b}.gff"
resources:
disc = 1
params:
g = 11 #config["annotation"].get("translation_table", "11")
log:
"{path}/{b}.log"
shell:
"""prodigal -i {input} -o {output.gff} -f gff -a {output.prot} \
-g {params.g} -p meta &> {log}""" #-d {output.nuc}

76 changes: 76 additions & 0 deletions bin/snakefiles/phylo.snake
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import glob

PHYLOPHLAN_DIR=config["software"]["phylophlan_folder"]
PHYLOPHLAN_ENV = config['phylophlan_env']

rule phylophlan_prep:
input:
bin_dir + "{sample}/prodigal/prodigal.done"
output:
touch(bin_dir + "{sample}/phylo/prep.done")
shell:
"""mkdir -p {PHYLOPHLAN_DIR}/input/{wildcards.sample} \n"""
"""cp $(dirname {input})/*.faa {PHYLOPHLAN_DIR}/input/{wildcards.sample}/"""

rule phylophlan:
input:
bin_dir + "{sample}/phylo/prep.done"
output:
touch(bin_dir + "{sample}/phylo/phlan.done")
threads:
config["phylophlan"]["threads"]
log:
bin_dir + "{sample}/phylo/phylophlan.log"
shell:
"""set +u; {PHYLOPHLAN_ENV}; set -u \n"""
"""touch {log} \n"""
"""log_file=$(readlink -e {log}) \n"""
"""cd {PHYLOPHLAN_DIR} \n"""
"""python phylophlan.py -i -t {wildcards.sample} --nproc {threads} &> $log_file \n"""
"""cd - \n"""
"""source deactivate"""

rule phylophlan_post:
input:
bin_dir + "{sample}/phylo/phlan.done"
output:
touch(bin_dir + "{sample}/phylo/phylo.done")
shell:
"""cp {PHYLOPHLAN_DIR}/output/{wildcards.sample}/* $(dirname {output}) \n"""
"""rm -r {PHYLOPHLAN_DIR}/{{input,output,data}}/{wildcards.sample}"""

#todo reduce code duplication
rule phylophlan_prep_combined:
input:
expand(bin_dir + "{sample}/prodigal/prodigal.done", sample=sample_names)
output:
touch("{path}/comb_prep.done")
shell:
"""mkdir -p {PHYLOPHLAN_DIR}/input/combined \n"""
"""for f in {input} ; do cp $(dirname $f)/*.faa {PHYLOPHLAN_DIR}/input/combined/ ; done"""

rule phylophlan_combined:
input:
"{path}/comb_prep.done"
output:
touch("{path}/comb_phlan.done")
threads:
config["phylophlan"]["combined_threads"]
log:
"{path}/phylophlan.log"
shell:
"""set +u; {PHYLOPHLAN_ENV}; set -u \n"""
"""touch {log} \n"""
"""log_file=$(readlink -e {log}) \n"""
"""cd {PHYLOPHLAN_DIR} ; python phylophlan.py -i -t combined --nproc {threads} &> $log_file ; cd - \n"""
"""source deactivate"""

rule phylophlan_post_combined:
input:
"{path}/comb_phlan.done"
output:
touch("{path}/comb_phylo.done")
shell:
"""cp {PHYLOPHLAN_DIR}/output/combined/* $(dirname {output}) \n"""
"""rm -r {PHYLOPHLAN_DIR}/{{input,output,data}}/combined"""

Loading