broadinstitute · dpark01 · Mar 29, 2018 · Mar 10, 2018 · Mar 22, 2018 · Mar 22, 2018
diff --git a/ncbi.py b/ncbi.py
@@ -381,6 +381,7 @@ def fasta2fsa(infname, outdir, biosample=None):
 def make_structured_comment_file(cmt_fname, name=None, seq_tech=None, coverage=None):
  with open(cmt_fname, 'wt') as outf:
  outf.write("StructuredCommentPrefix\t##Genome-Assembly-Data-START##\n")
+ # note: the <tool name> v. <version name> format is required by NCBI, don't remove the " v. "
  outf.write("Assembly Method\tgithub.com/broadinstitute/viral-ngs v. {}\n".format(util.version.get_version()))
  if name:
  outf.write("Assembly Name\t{}\n".format(name))
@@ -448,7 +449,7 @@ def prep_genbank_files(templateFile, fasta_files, annotDir,
  outf.write(inf.readline())
  for line in inf:
  row = line.rstrip('\n').split('\t')
- if row[0] == sample_base:
+ if row[0] == sample_base or row[0] == sample:
  row[0] = sample
  outf.write('\t'.join(row) + '\n')
 

diff --git a/pipes/WDL/workflows/align_and_annot.wdl b/pipes/WDL/workflows/align_and_annot.wdl
diff --git a/pipes/WDL/workflows/coverage_table.wdl b/pipes/WDL/workflows/coverage_table.wdl
@@ -0,0 +1,5 @@
+import "reports.wdl" as reports
+
+workflow coverage_table {
+ call reports.coverage_report
+}
diff --git a/pipes/WDL/workflows/genbank.wdl b/pipes/WDL/workflows/genbank.wdl
@@ -0,0 +1,29 @@
+import "interhost.wdl" as interhost
+import "ncbi.wdl" as ncbi
+
+workflow genbank {
+
+ File reference_fasta
+ Array[File]+ assemblies_fasta # one per genome
+ Array[File]+ ref_annotations_tbl # one per chromosome
+
+ call interhost.multi_align_mafft_ref as mafft {
+ input:
+ reference_fasta = reference_fasta,
+ assemblies_fasta = assemblies_fasta
+ }
+
+ call ncbi.annot_transfer as annot {
+ input:
+ multi_aln_fasta = mafft.alignments_by_chr,
+ reference_fasta = reference_fasta,
+ reference_feature_table = ref_annotations_tbl
+ }
+
+ call ncbi.prepare_genbank as prep_genbank {
+ input:
+ assemblies_fasta = assemblies_fasta,
+ annotations_tbl = annot.transferred_feature_tables
+ }
+
+}
diff --git a/pipes/WDL/workflows/mafft.wdl b/pipes/WDL/workflows/mafft.wdl
@@ -0,0 +1,5 @@
+import "interhost.wdl" as interhost
+
+workflow mafft {
+ call interhost.multi_align_mafft
+}
diff --git a/pipes/WDL/workflows/merge_bams.wdl b/pipes/WDL/workflows/merge_bams.wdl
@@ -0,0 +1,5 @@
+import "demux.wdl" as demux
+
+workflow merge_bams {
+ call demux.merge_and_reheader_bams
+}
diff --git a/pipes/WDL/workflows/tasks/demux.wdl b/pipes/WDL/workflows/tasks/demux.wdl
@@ -107,3 +107,38 @@ task illumina_demux {
  preemptible: 0 # this is the very first operation before scatter, so let's get it done quickly & reliably
  }
 }
+
+task merge_and_reheader_bams {
+ Array[File]+ in_bams
+ File? reheader_table # tsv with 3 cols: field, old value, new value
+ String out_basename
+
+ command {
+ set -ex -o pipefail
+
+ if [ ${length(in_bams)} -gt 1 ]; then
+ read_utils.py merge_bams ${sep=' ' in_bams} merged.bam --loglevel DEBUG
+ else
+ echo "Skipping merge, only one input file"
+ ln -s ${select_first(in_bams)} merged.bam
+ fi 
+
+ if [ -f ${reheader_table} ]; then
+ read_utils.py merged.bam ${reheader_table} ${out_basename}.bam --loglevel DEBUG
+ else
+ echo "Skipping reheader, no mapping table specified"
+ ln -s merged.bam ${out_basename}.bam
+ fi
+ }
+
+ output {
+ File out_bam = "${out_basename}.bam"
+ }
+
+ runtime {
+ docker: "quay.io/broadinstitute/viral-ngs"
+ memory: "2000 MB"
+ cpu: 2
+ dx_instance_type: "mem1_ssd2_x4"
+ }
+}
diff --git a/pipes/WDL/workflows/tasks/interhost.wdl b/pipes/WDL/workflows/tasks/interhost.wdl
@@ -1,92 +1,18 @@
-task ref_guided_consensus {
- String sample
-
- File referenceGenome #fasta
- File inBam
- # TODO: input settings must compute chr_names
- Array[String] chrNames # sampleName-1, sampleName-2, ...
-
- Int? threads
- Int? minCoverage
-
- String? novoalignOptions
-
- command {
- assembly.py refine_assembly \
- "${referenceGenome}" \
- "${inBam}" \
- "${sample}.fasta" \
- --outBam "${sample}.realigned.bam" \
- --outVcf "${sample}.vcf.gz" \
- "${'--min_coverage' + minCoverage || '3'}" \
- "${'--novo_params' + novoalignOptions || '-r Random -l 40 -g 40 -x 20 -t 100'}" \
- --keep_all_reads \
- --chr_names "${sep=' ' chrNames+}" \
- "${'--threads' + threads}"
- }
-
- output {
- File assembly = "${sample}.fasta"
- File realignedBam = "${sample}.realigned.bam"
- File variantCalls = "${sample}.vcf.gz"
- }
- runtime {
- memory: "4 GB"
- docker: "quay.io/broadinstitute/viral-ngs"
- }
-}
-
-task ref_guided_consensus_aligned_with_dups {
- String sample
-
- File realignedBam
-
- command {
- samtools view -b -F 4 -o "${sample}.realigned.only_aligned.bam" "${realignedBam}"
- }
-
- output {
- File onlyAlignedReadsBam = "${sample}.realigned.only_aligned.bam"
- }
- runtime {
- memory: "8 GB"
- docker: "quay.io/broadinstitute/viral-ngs"
- }
-}
-
-#task ref_guided_diversity {
-# String sample
-#
-# File assembly # .fasta
-# File variantCalls # .vcf.gz
-# File referenceGenome # .fasta, but also .fasta.fai and .dict
-#
-# command {
-# GenomeAnalysisTK.jar
-# -T CombineVariants -R "${referenceGenome}" {inFilesString} -o {outFile}
-# --genotypemergeoption UNIQUIFY
-# }
-#
-# output {
-#
-# }
-# runtime {
-# docker: "quay.io/broadinstitute/viral-ngs"
-# }
-#}
 
 task multi_align_mafft_ref {
  File reference_fasta
  Array[File]+ assemblies_fasta # fasta files, one per sample, multiple chrs per file okay
- String? out_prefix = basename(reference_fasta, '.fasta')
+ String  out_prefix = basename(reference_fasta, '.fasta')
  Int? mafft_maxIters
- Int? mafft_ep
+ Float? mafft_ep
+ Float? mafft_gapOpeningPenalty
 
  command {
  interhost.py multichr_mafft \
  ${reference_fasta} ${sep=' ' assemblies_fasta} \
  . \
  ${'--ep' + mafft_ep} \
+ ${'--gapOpeningPenalty' + mafft_gapOpeningPenalty} \
  ${'--maxiters' + mafft_maxIters} \
  --outFilePrefix ${out_prefix} \
  --preservecase \
@@ -96,29 +22,31 @@ task multi_align_mafft_ref {
  }
 
  output {
- File sampleNamesFile = "${out_prefix}-sample_names.txt"
- Array[File] alignments_by_chr = glob("${out_prefix}*.fasta")
+ File  sampleNamesFile = "${out_prefix}-sample_names.txt"
+ Array[File]+ alignments_by_chr = glob("${out_prefix}*.fasta")
  }
 
  runtime {
  docker: "quay.io/broadinstitute/viral-ngs"
  memory: "7 GB"
- cpu: 4
- dx_instance_type: "mem1_ssd1_x4"
+ cpu: 8
+ dx_instance_type: "mem1_ssd1_x8"
  }
 }
 
 task multi_align_mafft {
  Array[File]+ assemblies_fasta # fasta files, one per sample, multiple chrs per file okay
  String out_prefix
  Int? mafft_maxIters
- Int? mafft_ep
+ Float? mafft_ep
+ Float? mafft_gapOpeningPenalty
 
  command {
  interhost.py multichr_mafft \
  ${sep=' ' assemblies_fasta} \
  . \
  ${'--ep' + mafft_ep} \
+ ${'--gapOpeningPenalty' + mafft_gapOpeningPenalty} \
  ${'--maxiters' + mafft_maxIters} \
  --outFilePrefix ${out_prefix} \
  --preservecase \
@@ -135,8 +63,8 @@ task multi_align_mafft {
  runtime {
  docker: "quay.io/broadinstitute/viral-ngs"
  memory: "7 GB"
- cpu: 4
- dx_instance_type: "mem1_ssd1_x4"
+ cpu: 8
+ dx_instance_type: "mem1_ssd1_x8"
  }
 }
 

diff --git a/pipes/WDL/workflows/tasks/ncbi.wdl b/pipes/WDL/workflows/tasks/ncbi.wdl
@@ -58,22 +58,31 @@ task download_annotations {
 }
 
 task annot_transfer {
- File chr_mutli_aln_fasta # fasta; multiple alignments of sample sequences for a single chr
- File reference_fasta # fasta (may contain multiple chrs, only one with the same name as reference_feature_table will be used)
- File reference_feature_table # feature table corresponding to the chr in the alignment
+ Array[File]+ multi_aln_fasta # fasta; multiple alignments of sample sequences for each chromosome
+ File reference_fasta # fasta; all chromosomes in one file
+ Array[File]+ reference_feature_table # tbl; feature table corresponding to each chromosome in the alignment
+
+ Array[Int] chr_nums=range(length(multi_aln_fasta))
 
  command {
- ncbi.py tbl_transfer_prealigned \
- ${chr_mutli_aln_fasta} \
- ${reference_fasta} \
- ${reference_feature_table} \
- . \
- --oob_clip \
- --loglevel DEBUG
+ set -ex -o pipefail
+ echo ${sep=' ' multi_aln_fasta} > alignments.txt
+ echo ${sep=' ' reference_feature_table} > tbls.txt
+ for i in ${sep=' ' chr_nums}; do
+ _alignment_fasta=`cat alignments.txt | cut -f $(($i+1)) -d ' '`
+ _feature_tbl=`cat tbls.txt | cut -f $(($i+1)) -d ' '`
+ ncbi.py tbl_transfer_prealigned \
+ $_alignment_fasta \
+ ${reference_fasta} \
+ $_feature_tbl \
+ . \
+ --oob_clip \
+ --loglevel DEBUG
+ done
  }
 
  output {
- Array[File] transferred_feature_tables = glob("*.tbl")
+ Array[File]+ transferred_feature_tables = glob("*.tbl")
  }
  runtime {
  docker: "quay.io/broadinstitute/viral-ngs"
@@ -87,12 +96,13 @@ task prepare_genbank {
  Array[File]+ assemblies_fasta
  Array[File]+ annotations_tbl
  File authors_sbt
- File? coverage_table # summary.assembly.txt (from Snakemake)
- File? genbankSourceTable
- File? biosampleMap
- String? sequencingTech
- String? comment
- String out_prefix = "ncbi_package"
+ File biosampleMap
+ File genbankSourceTable
+ File? coverage_table # summary.assembly.txt (from Snakemake) -- change this to accept a list of mapped bam files and we can create this table ourselves
+ String sequencingTech
+ String comment # TO DO: make this optional
+ String organism
+ String molType = "cRNA"
 
  command {
  set -ex -o pipefail
@@ -101,19 +111,25 @@ task prepare_genbank {
  ${authors_sbt} \
  ${sep=' ' assemblies_fasta} \
  . \
- ${'--master_source_table=' + genbankSourceTable} \
- ${'--sequencing_tech=' + sequencingTech} \
- ${'--biosample_map=' + biosampleMap} \
- ${'--coverage_table=' + coverage_table} \
- ${'--comment=' + comment} \
+ --mol_type ${molType} \
+ --organism "${organism}" \
+ --biosample_map ${biosampleMap} \
+ --master_source_table ${genbankSourceTable} \
+ ${'--coverage_table ' + coverage_table} \
+ --comment "${comment}" \
+ --sequencing_tech "${sequencingTech}" \
  --loglevel DEBUG
- tar -czpvf ${out_prefix}.tar.gz *.val *.cmt *.fsa *.gbf *.sqn *.src *.tbl
+ mv errorsummary.val errorsummary.val.txt # to keep it separate from the glob
  }
 
  output {
  Array[File] sequin_files = glob("*.sqn")
- File ncbi_package = "${out_prefix}.tar.gz"
- File errorSummary = "errorsummary.val"
+ Array[File] structured_comment_files = glob("*.cmt")
+ Array[File] genbank_preview_files = glob("*.gbf")
+ Array[File] source_table_files = glob("*.src")
+ Array[File] fasta_per_chr_files = glob("*.fsa")
+ Array[File] validation_files = glob("*.val")
+ File errorSummary = "errorsummary.val.txt"
  }
 
  runtime {
@@ -122,4 +138,4 @@ task prepare_genbank {
  cpu: 2
  dx_instance_type: "mem1_ssd1_x2"
  }
-}
+}