Merge pull request #106 from jaleezyy/multi_seq

Add FreeBayes variant calling and restore multiple pool sample support
jaleezyy · Mar 19, 2021 · c77f400 · c77f400
2 parents 93bc296 + 864ea01
commit c77f400
Show file tree

Hide file tree

Showing 15 changed files with 880 additions and 121 deletions.
diff --git a/README.md b/README.md
@@ -105,16 +105,17 @@ processing and postprocessing into 'all' and 'postprocess' targets.
 Related: because pipeline stages can fail, we recommend running 'snakemake all'
 with the -k flag ("Go on with independent jobs if a job fails").
 
-Additionally, you can run @jts' [ncov-tools](https://github.com/jts/ncov-tools)
+Additionally, SIGNAL can prepare output for use with @jts' [ncov-tools](https://github.com/jts/ncov-tools)
 to generate phylogenies and alternative summaries.
 
     snakemake --use-conda --cores 10 ncov_tools
 
-Signal manages installing the dependencies and will invoke `ncov-tools` if 
-it has been cloned as a sub-module and a fasta containing sequences to include in 
-the tree has been specified using `phylo_include_seqs:` in the main signal `config.yaml`.
+SIGNAL manages installing the dependencies and will generate the necessary hard links to required input
+files from SIGNAL for `ncov-tools` if it has been cloned as a sub-module and a fasta containing sequences 
+to include in the tree has been specified using `phylo_include_seqs:` in the main SIGANL`config.yaml`.
 
-Outputs will be written as specified within the `ncov-tools` folder and documentation.
+Outputs will be written as specified within the `ncov-tools` folder and documentation. At present, invoking `ncov-tools` 
+should be done manually as per its documentation. 
 
 ### Docker
 

diff --git a/Snakefile b/Snakefile
@@ -45,15 +45,19 @@ validate(config, 'resources/config.schema.yaml')
 samples = pd.read_table(config['samples'], sep=',')
 validate(samples, 'resources/sample.schema.yaml')
 
+# manual assignment of breseq reference
+try:
+    if os.path.exists(config['breseq_reference']):
+        breseq_ref = config['breseq_reference']
+    else:
+        breseq_ref = ""
+except TypeError:
+    breseq_ref = ""
+
 # set output directory
 exec_dir = os.getcwd()
 workdir: os.path.abspath(config['result_dir'])
 
-# throw error if duplicate sample names in table
-if samples['sample'].duplicated().any():
-    print("Duplicate sample names in sample table, please fix and restart")
-    exit(1)
-
 # get sample names 
 sample_names = sorted(samples['sample'].drop_duplicates().values)
 
@@ -66,6 +70,22 @@ def get_input_fastq_files(sample_name, r):
 
     return os.path.abspath(os.path.join(exec_dir, relpath))
 
+def get_pooled_fastq_files(sample_name, r):
+    sample_fastqs = samples[samples['sample'] == sample_name]
+    if r == '1':
+        relpath = sample_fastqs['r1_path'].values
+    elif r == '2':
+        relpath = sample_fastqs['r2_path'].values
+
+    return [ os.path.abspath(os.path.join(exec_dir, r)) for r in relpath ]
+
+# determine raw FASTQ handling
+# if duplicate sample names in table, run legacy concat_and_sort
+if samples['sample'].duplicated().any():
+    print("Duplicate sample names in sample table. Assuming multi-lane samples exist")
+    ruleorder: concat_and_sort > link_raw_data
+else:
+    ruleorder: link_raw_data > concat_and_sort
 
 ######################################   High-level targets   ######################################
 rule raw_read_data_symlinks:
@@ -96,6 +116,14 @@ rule ivar_variants:
 
 rule breseq:
     input: expand('{sn}/breseq/{sn}_output/index.html', sn=sample_names)
+
+rule freebayes:
+    input: 
+        expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names),
+        expand('{sn}/freebayes/{sn}.variants.norm.vcf', sn=sample_names),
+        'freebayes_lineage_assignments.tsv',
+        expand('{sn}/freebayes/quast/{sn}_quast_report.html', sn=sample_names)
+
 
 rule coverage:
     input: expand('{sn}/coverage/{sn}_depth.txt', sn=sample_names)
@@ -110,48 +138,60 @@ rule quast:
     input: expand('{sn}/quast/{sn}_quast_report.html', sn=sample_names)
 
 rule lineages:
-    input: 'lineage_assignments.tsv'
+    input: 
+        'lineage_assignments.tsv'
 
 rule config_sample_log:
     input: 
         config_filename,
         config['samples']
 
-
-if config['run_breseq']:
-    rule all:
+# to handle different options in variant calling
+if config['run_breseq'] and config['run_freebayes']:
+    if breseq_ref == "":
+        print("Invalid BreSeq reference (paramter: breseq_reference) in config file. Please double check and restart")
+        exit(1)
+    rule variant_calling:
         input:
-            rules.raw_read_data_symlinks.input,
-            rules.host_removed_raw_reads.input,
-            rules.remove_adapters.input,
-            rules.fastqc.input,
-            rules.clean_reads.input,
-            rules.consensus.input,
+            rules.breseq.input,
             rules.ivar_variants.input,
-            rules.coverage.input,
-            rules.coverage_plot.input,
-            rules.kraken2.input,
-            rules.quast.input,
-            rules.config_sample_log.input,
+            rules.consensus.input,
+            rules.freebayes.input
+elif config['run_breseq'] and not config['run_freebayes']:
+    if breseq_ref == "":
+        print("Invalid BreSeq reference (paramter: breseq_reference) in config file. Please double check and restart")
+        exit(1)
+    rule variant_calling:
+        input:
             rules.breseq.input,
-            rules.lineages.input
+            rules.ivar_variants.input,
+            rules.consensus.input
+elif not config['run_breseq'] and config['run_freebayes']:
+    rule variant_calling:
+        input:
+            rules.freebayes.input,
+            rules.ivar_variants.input,
+            rules.consensus.input
 else:
-    rule all:
+    rule variant_calling:
         input:
-            rules.raw_read_data_symlinks.input,
-            rules.host_removed_raw_reads.input,
-            rules.remove_adapters.input,
-            rules.fastqc.input,
-            rules.clean_reads.input,
-            rules.consensus.input,
             rules.ivar_variants.input,
-            rules.coverage.input,
-            rules.coverage_plot.input,
-            rules.kraken2.input,
-            rules.quast.input,
-            rules.config_sample_log.input,
-            rules.lineages.input
+            rules.consensus.input
 
+rule all:
+    input:
+        rules.raw_read_data_symlinks.input,
+        rules.host_removed_raw_reads.input,
+        rules.remove_adapters.input,
+        rules.fastqc.input,
+        rules.clean_reads.input,
+        rules.coverage.input,
+        rules.coverage_plot.input,
+        rules.kraken2.input,
+        rules.quast.input,
+        rules.config_sample_log.input,
+        rules.variant_calling.input,
+        rules.lineages.input
 
 rule postprocess:
     conda: 
@@ -183,6 +223,7 @@ rule ncov_tools:
 
 
 ################################# Copy config and sample table to output folder ##################
+
 rule copy_config_sample_log:
     output: 
         config = os.path.basename(config_filename),
@@ -197,6 +238,7 @@ rule copy_config_sample_log:
         """
 
 #################################   Based on scripts/assemble.sh   #################################
+
 rule link_raw_data:
     priority: 4
     output:
@@ -206,6 +248,17 @@ rule link_raw_data:
     shell:
         'ln -s {input} {output}'
 
+rule concat_and_sort:
+    priority: 4
+    output:
+        '{sn}/raw_fastq/{sn}_R{r}.fastq.gz'
+    input:
+        lambda wildcards: get_pooled_fastq_files(wildcards.sn, wildcards.r)
+    benchmark:
+        "{sn}/benchmarks/{sn}_concat_and_sort_R{r}.benchmark.tsv"
+    shell:
+        'if [ $(echo {input} | wc -w) -gt 1 ]; then zcat -f {input} | paste - - - - | sort -k1,1 -t " " | tr "\\t" "\\n" | gzip > {output}; else ln -s {input} {output}; fi'
+
 rule run_raw_fastqc:
     conda: 
         'conda_envs/trim_qc.yaml'
@@ -227,6 +280,7 @@ rule run_raw_fastqc:
         """
 
 ########################## Human Host Removal ################################
+
 rule raw_reads_composite_reference_bwa_map:
     threads: 2
     conda: 
@@ -354,7 +408,6 @@ rule viral_reference_bwa_map:
         'samtools view -bS | samtools sort -@{threads} -o {output}) 2> {log}'
 
 
-
 rule run_bed_primer_trim:
     conda: 
         'conda_envs/ivar.yaml'
@@ -437,8 +490,8 @@ rule run_ivar_consensus:
         "{sn}/benchmarks/{sn}_ivar_consensus.benchmark.tsv"
     params:
         mpileup_depth = config['mpileup_depth'],
-        ivar_min_coverage_depth = config['ivar_min_coverage_depth'],
-        ivar_freq_threshold = config['ivar_freq_threshold'],
+        ivar_min_coverage_depth = config['var_min_coverage_depth'],
+        ivar_freq_threshold = config['var_freq_threshold'],
         output_prefix = '{sn}/core/{sn}.consensus',
     shell:
         '(samtools mpileup -aa -A -d {params.mpileup_depth} -Q0 {input} | '
@@ -460,7 +513,7 @@ rule index_viral_reference:
     shell:
         'samtools faidx {input}'
 
-    
+
 rule run_ivar_variants:
     conda: 
         'conda_envs/ivar.yaml'
@@ -477,9 +530,9 @@ rule run_ivar_variants:
         "{sn}/benchmarks/{sn}_ivar_variants.benchmark.tsv"
     params:
         output_prefix = '{sn}/core/{sn}_ivar_variants',
-        ivar_min_coverage_depth = config['ivar_min_coverage_depth'],
-        ivar_min_freq_threshold = config['ivar_min_freq_threshold'],
-        ivar_min_variant_quality = config['ivar_min_variant_quality'],
+        ivar_min_coverage_depth = config['var_min_coverage_depth'],
+        ivar_min_freq_threshold = config['var_min_freq_threshold'],
+        ivar_min_variant_quality = config['var_min_variant_quality'],
     shell:
         '(samtools mpileup -aa -A -d 0 --reference {input.reference} -B '
             '-Q 0 {input.read_bam} | '
@@ -503,7 +556,7 @@ rule run_breseq:
     benchmark:
         "{sn}/benchmarks/{sn}_run_breseq.benchmark.tsv"
     params:
-        ref = os.path.join(exec_dir, config['breseq_reference']),
+        ref = os.path.join(exec_dir, breseq_ref),
         outdir = '{sn}/breseq',
         unlabelled_output_dir = '{sn}/breseq/output',
         labelled_output_dir = '{sn}/breseq/{sn}_output'
@@ -513,6 +566,64 @@ rule run_breseq:
         mv -T {params.unlabelled_output_dir} {params.labelled_output_dir}
         """
 
+################## Based on https://github.com/jts/ncov2019-artic-nf/blob/be26baedcc6876a798a599071bb25e0973261861/modules/illumina.nf ##################
+
+rule run_freebayes:
+    threads: 1
+    priority: 1
+    conda: 'conda_envs/freebayes.yaml'
+    output:
+        consensus = '{sn}/freebayes/{sn}.consensus.fasta',
+        variants = '{sn}/freebayes/{sn}.variants.norm.vcf'
+    input:
+        reference = os.path.join(exec_dir, config['viral_reference_genome']),
+        read_bam = "{sn}/core/{sn}_viral_reference.mapping.primertrimmed.sorted.bam"
+    params:
+        out = '{sn}/freebayes/work/{sn}',
+        freebayes_min_coverage_depth = config['var_min_coverage_depth'],
+        freebayes_min_freq_threshold = config['var_min_freq_threshold'],
+        freebayes_min_variant_quality = config['var_min_variant_quality'],
+        freebayes_freq_threshold = config['var_freq_threshold'],
+        script_path = os.path.join(exec_dir, "scripts", "process_gvcf.py")
+    shell:
+        """
+        mkdir -p $(dirname {params.out})
+        # the sed is to fix the header until a release is made with this fix
+        # https://github.com/freebayes/freebayes/pull/549
+        freebayes -p 1 \
+                  -f {input.reference} \
+                  -F 0.2 \
+                  -C 1 \
+                  --pooled-continuous \
+                  --min-coverage {params.freebayes_min_coverage_depth} \
+                  --gvcf --gvcf-dont-use-chunk true {input.read_bam} | sed s/QR,Number=1,Type=Integer/QR,Number=1,Type=Float/ > {params.out}.gvcf
+
+        # make depth mask, split variants into ambiguous/consensus
+        # NB: this has to happen before bcftools norm or else the depth mask misses any bases exposed during normalization
+        python {params.script_path} -d {params.freebayes_min_coverage_depth} \
+                        -l {params.freebayes_min_freq_threshold} \
+                        -u {params.freebayes_freq_threshold} \
+                        -m {params.out}.mask.txt \
+                        -v {params.out}.variants.vcf \
+                        -c {params.out}.consensus.vcf {params.out}.gvcf 
+
+        # normalize variant records into canonical VCF representation
+        bcftools norm -f {input.reference} {params.out}.variants.vcf > {output.variants} 
+        bcftools norm -f {input.reference} {params.out}.consensus.vcf > {params.out}.consensus.norm.vcf
+
+        # split the consensus sites file into a set that should be IUPAC codes and all other bases, using the ConsensusTag in the VCF
+        for vt in "ambiguous" "fixed"; do
+            cat {params.out}.consensus.norm.vcf | awk -v vartag=ConsensusTag=$vt '$0 ~ /^#/ || $0 ~ vartag' > {params.out}.$vt.norm.vcf
+            bgzip -f {params.out}.$vt.norm.vcf  
+            tabix -f -p vcf {params.out}.$vt.norm.vcf.gz 
+        done
+        
+        # apply ambiguous variants first using IUPAC codes. this variant set cannot contain indels or the subsequent step will break
+        bcftools consensus -f {input.reference} -I {params.out}.ambiguous.norm.vcf.gz > {params.out}.ambiguous.fasta 
+        # apply remaninng variants, including indels
+        bcftools consensus -f {params.out}.ambiguous.fasta -m {params.out}.mask.txt {params.out}.fixed.norm.vcf.gz | sed s/MN908947\.3.*/{wildcards.sn}/ > {output.consensus}
+        """
+
 
 ##################  Based on scripts/hisat2.sh and scripts/coverage_stats_avg.sh  ##################
 
@@ -538,7 +649,7 @@ rule generate_coverage_plot:
         script_path = os.path.join(exec_dir, "scripts", "generate_coverage_plot.py")
     shell:
         "python {params.script_path} {input} {output}"
-    
+
 ################################   Based on scripts/kraken2.sh   ###################################
 
 
@@ -599,6 +710,26 @@ rule run_quast:
          'quast {input} -r {params.genome} -g {params.fcoords} --output-dir {params.outdir} --threads {threads} >{log} && '
          'for f in {params.unlabelled_reports}; do mv $f ${{f/report/{params.sample_name}}}; done'
 
+rule run_quast_freebayes:
+    threads: 1
+    conda: 'conda_envs/assembly_qc.yaml'
+    output:
+         '{sn}/freebayes/quast/{sn}_quast_report.html'
+    input:
+         '{sn}/freebayes/{sn}.consensus.fasta'
+    log:
+         '{sn}/freebayes/quast/{sn}_quast.log'
+    benchmark:
+        "{sn}/benchmarks/{sn}_run_quast.benchmark.tsv"
+    params:
+         outdir = '{sn}/freebayes/quast',
+         genome = os.path.join(exec_dir, config['viral_reference_genome']),
+         fcoords = os.path.join(exec_dir, config['viral_reference_feature_coords']),
+         sample_name = '{sn}_quast_report',
+         unlabelled_reports = '{sn}/freebayes/quast/report.*'
+    shell:
+         'quast {input} -r {params.genome} -g {params.fcoords} --output-dir {params.outdir} --threads {threads} >{log} && '
+         'for f in {params.unlabelled_reports}; do mv $f ${{f/report/{params.sample_name}}}; done'
 
 rule run_lineage_assignment:
     threads: 4
@@ -612,3 +743,16 @@ rule run_lineage_assignment:
     shell:
         'cat {input} > all_genomes.fa && '
         '{params.assignment_script_path} -i all_genomes.fa -t {threads} -o {output}'
+
+rule run_lineage_assignment_freebayes:
+    threads: 4
+    conda: 'conda_envs/assign_lineages.yaml'
+    output:
+        'freebayes_lineage_assignments.tsv'
+    input:
+        expand('{sn}/freebayes/{sn}.consensus.fasta', sn=sample_names)
+    params:
+        assignment_script_path = os.path.join(exec_dir, 'scripts', 'assign_lineages.py'),
+    shell:
+        'cat {input} > all_freebayes_genomes.fa && '
+        '{params.assignment_script_path} -i all_freebayes_genomes.fa -t {threads} -o {output}'
diff --git a/conda_envs/freebayes.yaml b/conda_envs/freebayes.yaml
@@ -0,0 +1,10 @@
+name: freebayes
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bcftools=1.9
+  - freebayes=1.3.2
+  - pysam=0.15.3
+  - tabix=0.2.6