Skip to content

Commit

Permalink
Merge pull request #106 from jaleezyy/multi_seq
Browse files Browse the repository at this point in the history
Add FreeBayes variant calling and restore multiple pool sample support
  • Loading branch information
jaleezyy authored Mar 19, 2021
2 parents 93bc296 + 864ea01 commit c77f400
Show file tree
Hide file tree
Showing 15 changed files with 880 additions and 121 deletions.
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,16 +105,17 @@ processing and postprocessing into 'all' and 'postprocess' targets.
Related: because pipeline stages can fail, we recommend running 'snakemake all'
with the -k flag ("Go on with independent jobs if a job fails").

Additionally, you can run @jts' [ncov-tools](https://github.com/jts/ncov-tools)
Additionally, SIGNAL can prepare output for use with @jts' [ncov-tools](https://github.com/jts/ncov-tools)
to generate phylogenies and alternative summaries.

snakemake --use-conda --cores 10 ncov_tools

Signal manages installing the dependencies and will invoke `ncov-tools` if
it has been cloned as a sub-module and a fasta containing sequences to include in
the tree has been specified using `phylo_include_seqs:` in the main signal `config.yaml`.
SIGNAL manages installing the dependencies and will generate the necessary hard links to required input
files from SIGNAL for `ncov-tools` if it has been cloned as a sub-module and a fasta containing sequences
to include in the tree has been specified using `phylo_include_seqs:` in the main SIGANL`config.yaml`.

Outputs will be written as specified within the `ncov-tools` folder and documentation.
Outputs will be written as specified within the `ncov-tools` folder and documentation. At present, invoking `ncov-tools`
should be done manually as per its documentation.

### Docker

Expand Down
230 changes: 187 additions & 43 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,19 @@ validate(config, 'resources/config.schema.yaml')
samples = pd.read_table(config['samples'], sep=',')
validate(samples, 'resources/sample.schema.yaml')

# manual assignment of breseq reference
try:
if os.path.exists(config['breseq_reference']):
breseq_ref = config['breseq_reference']
else:
breseq_ref = ""
except TypeError:
breseq_ref = ""

# set output directory
exec_dir = os.getcwd()
workdir: os.path.abspath(config['result_dir'])

# throw error if duplicate sample names in table
if samples['sample'].duplicated().any():
print("Duplicate sample names in sample table, please fix and restart")
exit(1)

# get sample names
sample_names = sorted(samples['sample'].drop_duplicates().values)

Expand All @@ -66,6 +70,22 @@ def get_input_fastq_files(sample_name, r):

return os.path.abspath(os.path.join(exec_dir, relpath))

def get_pooled_fastq_files(sample_name, r):
sample_fastqs = samples[samples['sample'] == sample_name]
if r == '1':
relpath = sample_fastqs['r1_path'].values
elif r == '2':
relpath = sample_fastqs['r2_path'].values

return [ os.path.abspath(os.path.join(exec_dir, r)) for r in relpath ]

# determine raw FASTQ handling
# if duplicate sample names in table, run legacy concat_and_sort
if samples['sample'].duplicated().any():
print("Duplicate sample names in sample table. Assuming multi-lane samples exist")
ruleorder: concat_and_sort > link_raw_data
else:
ruleorder: link_raw_data > concat_and_sort

###################################### High-level targets ######################################
rule raw_read_data_symlinks:
Expand Down Expand Up @@ -96,6 +116,14 @@ rule ivar_variants:

rule breseq:
input: expand('{sn}/breseq/{sn}_output/index.html', sn=sample_names)

rule freebayes:
input:
expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names),
expand('{sn}/freebayes/{sn}.variants.norm.vcf', sn=sample_names),
'freebayes_lineage_assignments.tsv',
expand('{sn}/freebayes/quast/{sn}_quast_report.html', sn=sample_names)


rule coverage:
input: expand('{sn}/coverage/{sn}_depth.txt', sn=sample_names)
Expand All @@ -110,48 +138,60 @@ rule quast:
input: expand('{sn}/quast/{sn}_quast_report.html', sn=sample_names)

rule lineages:
input: 'lineage_assignments.tsv'
input:
'lineage_assignments.tsv'

rule config_sample_log:
input:
config_filename,
config['samples']


if config['run_breseq']:
rule all:
# to handle different options in variant calling
if config['run_breseq'] and config['run_freebayes']:
if breseq_ref == "":
print("Invalid BreSeq reference (paramter: breseq_reference) in config file. Please double check and restart")
exit(1)
rule variant_calling:
input:
rules.raw_read_data_symlinks.input,
rules.host_removed_raw_reads.input,
rules.remove_adapters.input,
rules.fastqc.input,
rules.clean_reads.input,
rules.consensus.input,
rules.breseq.input,
rules.ivar_variants.input,
rules.coverage.input,
rules.coverage_plot.input,
rules.kraken2.input,
rules.quast.input,
rules.config_sample_log.input,
rules.consensus.input,
rules.freebayes.input
elif config['run_breseq'] and not config['run_freebayes']:
if breseq_ref == "":
print("Invalid BreSeq reference (paramter: breseq_reference) in config file. Please double check and restart")
exit(1)
rule variant_calling:
input:
rules.breseq.input,
rules.lineages.input
rules.ivar_variants.input,
rules.consensus.input
elif not config['run_breseq'] and config['run_freebayes']:
rule variant_calling:
input:
rules.freebayes.input,
rules.ivar_variants.input,
rules.consensus.input
else:
rule all:
rule variant_calling:
input:
rules.raw_read_data_symlinks.input,
rules.host_removed_raw_reads.input,
rules.remove_adapters.input,
rules.fastqc.input,
rules.clean_reads.input,
rules.consensus.input,
rules.ivar_variants.input,
rules.coverage.input,
rules.coverage_plot.input,
rules.kraken2.input,
rules.quast.input,
rules.config_sample_log.input,
rules.lineages.input
rules.consensus.input

rule all:
input:
rules.raw_read_data_symlinks.input,
rules.host_removed_raw_reads.input,
rules.remove_adapters.input,
rules.fastqc.input,
rules.clean_reads.input,
rules.coverage.input,
rules.coverage_plot.input,
rules.kraken2.input,
rules.quast.input,
rules.config_sample_log.input,
rules.variant_calling.input,
rules.lineages.input

rule postprocess:
conda:
Expand Down Expand Up @@ -183,6 +223,7 @@ rule ncov_tools:


################################# Copy config and sample table to output folder ##################

rule copy_config_sample_log:
output:
config = os.path.basename(config_filename),
Expand All @@ -197,6 +238,7 @@ rule copy_config_sample_log:
"""

################################# Based on scripts/assemble.sh #################################

rule link_raw_data:
priority: 4
output:
Expand All @@ -206,6 +248,17 @@ rule link_raw_data:
shell:
'ln -s {input} {output}'

rule concat_and_sort:
priority: 4
output:
'{sn}/raw_fastq/{sn}_R{r}.fastq.gz'
input:
lambda wildcards: get_pooled_fastq_files(wildcards.sn, wildcards.r)
benchmark:
"{sn}/benchmarks/{sn}_concat_and_sort_R{r}.benchmark.tsv"
shell:
'if [ $(echo {input} | wc -w) -gt 1 ]; then zcat -f {input} | paste - - - - | sort -k1,1 -t " " | tr "\\t" "\\n" | gzip > {output}; else ln -s {input} {output}; fi'

rule run_raw_fastqc:
conda:
'conda_envs/trim_qc.yaml'
Expand All @@ -227,6 +280,7 @@ rule run_raw_fastqc:
"""

########################## Human Host Removal ################################

rule raw_reads_composite_reference_bwa_map:
threads: 2
conda:
Expand Down Expand Up @@ -354,7 +408,6 @@ rule viral_reference_bwa_map:
'samtools view -bS | samtools sort -@{threads} -o {output}) 2> {log}'



rule run_bed_primer_trim:
conda:
'conda_envs/ivar.yaml'
Expand Down Expand Up @@ -437,8 +490,8 @@ rule run_ivar_consensus:
"{sn}/benchmarks/{sn}_ivar_consensus.benchmark.tsv"
params:
mpileup_depth = config['mpileup_depth'],
ivar_min_coverage_depth = config['ivar_min_coverage_depth'],
ivar_freq_threshold = config['ivar_freq_threshold'],
ivar_min_coverage_depth = config['var_min_coverage_depth'],
ivar_freq_threshold = config['var_freq_threshold'],
output_prefix = '{sn}/core/{sn}.consensus',
shell:
'(samtools mpileup -aa -A -d {params.mpileup_depth} -Q0 {input} | '
Expand All @@ -460,7 +513,7 @@ rule index_viral_reference:
shell:
'samtools faidx {input}'


rule run_ivar_variants:
conda:
'conda_envs/ivar.yaml'
Expand All @@ -477,9 +530,9 @@ rule run_ivar_variants:
"{sn}/benchmarks/{sn}_ivar_variants.benchmark.tsv"
params:
output_prefix = '{sn}/core/{sn}_ivar_variants',
ivar_min_coverage_depth = config['ivar_min_coverage_depth'],
ivar_min_freq_threshold = config['ivar_min_freq_threshold'],
ivar_min_variant_quality = config['ivar_min_variant_quality'],
ivar_min_coverage_depth = config['var_min_coverage_depth'],
ivar_min_freq_threshold = config['var_min_freq_threshold'],
ivar_min_variant_quality = config['var_min_variant_quality'],
shell:
'(samtools mpileup -aa -A -d 0 --reference {input.reference} -B '
'-Q 0 {input.read_bam} | '
Expand All @@ -503,7 +556,7 @@ rule run_breseq:
benchmark:
"{sn}/benchmarks/{sn}_run_breseq.benchmark.tsv"
params:
ref = os.path.join(exec_dir, config['breseq_reference']),
ref = os.path.join(exec_dir, breseq_ref),
outdir = '{sn}/breseq',
unlabelled_output_dir = '{sn}/breseq/output',
labelled_output_dir = '{sn}/breseq/{sn}_output'
Expand All @@ -513,6 +566,64 @@ rule run_breseq:
mv -T {params.unlabelled_output_dir} {params.labelled_output_dir}
"""

################## Based on https://github.com/jts/ncov2019-artic-nf/blob/be26baedcc6876a798a599071bb25e0973261861/modules/illumina.nf ##################

rule run_freebayes:
threads: 1
priority: 1
conda: 'conda_envs/freebayes.yaml'
output:
consensus = '{sn}/freebayes/{sn}.consensus.fasta',
variants = '{sn}/freebayes/{sn}.variants.norm.vcf'
input:
reference = os.path.join(exec_dir, config['viral_reference_genome']),
read_bam = "{sn}/core/{sn}_viral_reference.mapping.primertrimmed.sorted.bam"
params:
out = '{sn}/freebayes/work/{sn}',
freebayes_min_coverage_depth = config['var_min_coverage_depth'],
freebayes_min_freq_threshold = config['var_min_freq_threshold'],
freebayes_min_variant_quality = config['var_min_variant_quality'],
freebayes_freq_threshold = config['var_freq_threshold'],
script_path = os.path.join(exec_dir, "scripts", "process_gvcf.py")
shell:
"""
mkdir -p $(dirname {params.out})
# the sed is to fix the header until a release is made with this fix
# https://github.com/freebayes/freebayes/pull/549
freebayes -p 1 \
-f {input.reference} \
-F 0.2 \
-C 1 \
--pooled-continuous \
--min-coverage {params.freebayes_min_coverage_depth} \
--gvcf --gvcf-dont-use-chunk true {input.read_bam} | sed s/QR,Number=1,Type=Integer/QR,Number=1,Type=Float/ > {params.out}.gvcf
# make depth mask, split variants into ambiguous/consensus
# NB: this has to happen before bcftools norm or else the depth mask misses any bases exposed during normalization
python {params.script_path} -d {params.freebayes_min_coverage_depth} \
-l {params.freebayes_min_freq_threshold} \
-u {params.freebayes_freq_threshold} \
-m {params.out}.mask.txt \
-v {params.out}.variants.vcf \
-c {params.out}.consensus.vcf {params.out}.gvcf
# normalize variant records into canonical VCF representation
bcftools norm -f {input.reference} {params.out}.variants.vcf > {output.variants}
bcftools norm -f {input.reference} {params.out}.consensus.vcf > {params.out}.consensus.norm.vcf
# split the consensus sites file into a set that should be IUPAC codes and all other bases, using the ConsensusTag in the VCF
for vt in "ambiguous" "fixed"; do
cat {params.out}.consensus.norm.vcf | awk -v vartag=ConsensusTag=$vt '$0 ~ /^#/ || $0 ~ vartag' > {params.out}.$vt.norm.vcf
bgzip -f {params.out}.$vt.norm.vcf
tabix -f -p vcf {params.out}.$vt.norm.vcf.gz
done
# apply ambiguous variants first using IUPAC codes. this variant set cannot contain indels or the subsequent step will break
bcftools consensus -f {input.reference} -I {params.out}.ambiguous.norm.vcf.gz > {params.out}.ambiguous.fasta
# apply remaninng variants, including indels
bcftools consensus -f {params.out}.ambiguous.fasta -m {params.out}.mask.txt {params.out}.fixed.norm.vcf.gz | sed s/MN908947\.3.*/{wildcards.sn}/ > {output.consensus}
"""


################## Based on scripts/hisat2.sh and scripts/coverage_stats_avg.sh ##################

Expand All @@ -538,7 +649,7 @@ rule generate_coverage_plot:
script_path = os.path.join(exec_dir, "scripts", "generate_coverage_plot.py")
shell:
"python {params.script_path} {input} {output}"

################################ Based on scripts/kraken2.sh ###################################


Expand Down Expand Up @@ -599,6 +710,26 @@ rule run_quast:
'quast {input} -r {params.genome} -g {params.fcoords} --output-dir {params.outdir} --threads {threads} >{log} && '
'for f in {params.unlabelled_reports}; do mv $f ${{f/report/{params.sample_name}}}; done'

rule run_quast_freebayes:
threads: 1
conda: 'conda_envs/assembly_qc.yaml'
output:
'{sn}/freebayes/quast/{sn}_quast_report.html'
input:
'{sn}/freebayes/{sn}.consensus.fasta'
log:
'{sn}/freebayes/quast/{sn}_quast.log'
benchmark:
"{sn}/benchmarks/{sn}_run_quast.benchmark.tsv"
params:
outdir = '{sn}/freebayes/quast',
genome = os.path.join(exec_dir, config['viral_reference_genome']),
fcoords = os.path.join(exec_dir, config['viral_reference_feature_coords']),
sample_name = '{sn}_quast_report',
unlabelled_reports = '{sn}/freebayes/quast/report.*'
shell:
'quast {input} -r {params.genome} -g {params.fcoords} --output-dir {params.outdir} --threads {threads} >{log} && '
'for f in {params.unlabelled_reports}; do mv $f ${{f/report/{params.sample_name}}}; done'

rule run_lineage_assignment:
threads: 4
Expand All @@ -612,3 +743,16 @@ rule run_lineage_assignment:
shell:
'cat {input} > all_genomes.fa && '
'{params.assignment_script_path} -i all_genomes.fa -t {threads} -o {output}'

rule run_lineage_assignment_freebayes:
threads: 4
conda: 'conda_envs/assign_lineages.yaml'
output:
'freebayes_lineage_assignments.tsv'
input:
expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names)
params:
assignment_script_path = os.path.join(exec_dir, 'scripts', 'assign_lineages.py'),
shell:
'cat {input} > all_freebayes_genomes.fa && '
'{params.assignment_script_path} -i all_freebayes_genomes.fa -t {threads} -o {output}'
10 changes: 10 additions & 0 deletions conda_envs/freebayes.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
name: freebayes
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bcftools=1.9
- freebayes=1.3.2
- pysam=0.15.3
- tabix=0.2.6
Loading

0 comments on commit c77f400

Please sign in to comment.