Merge pull request #60 from AndreaGuarracino/gzip_output

added flag to compress the output files
nf-core · Mar 23, 2021 · e253556 · e253556
2 parents 4e59b8f + f0c12c0
commit e253556
Show file tree

Hide file tree

Showing 3 changed files with 98 additions and 70 deletions.
diff --git a/main.nf b/main.nf
@@ -37,31 +37,31 @@ if (!params.file_name_prefix) {
 } else if (params.file_name_prefix == "pggb") {
   // fancy naming scheme
   file_name_prefix_display = ".pggb"
-  alignment_prefix = """-\
-  ${aligner}-\
-  s${params.alignment_segment_length}-\
-  l${params.alignment_block_length}-\
-  p${params.alignment_map_pct_id}-\
-  n${params.alignment_n_secondary}-\
-  ${edyeet_align_pct_id_display}\
-  K${params.alignment_mash_kmer}\
-  ${alignment_merge_cmd}\
-  ${alignment_split_cmd}\
-  ${alignment_exclude_cmd}\
-  """.stripIndent()
-  seqwish_prefix = """\
-  .seqwish-\
-  k${params.seqwish_min_match_length}-\
-  B${params.seqwish_transclose_batch}\
-  """.stripIndent()
-  smoothxg_prefix = """${seqwish_prefix}\
-  .smoothxg-\
-  w${params.smoothxg_max_block_weight}-\
-  j${params.smoothxg_max_path_jump}-\
-  e${params.smoothxg_max_edge_jump}-\
-  I${params.smoothxg_block_id_min}-\
-  p${smoothxg_poa_params_display}-M-J0.7-K-G150\
-  """.stripIndent()
+alignment_prefix = """-\
+${aligner}-\
+s${params.alignment_segment_length}-\
+l${params.alignment_block_length}-\
+p${params.alignment_map_pct_id}-\
+n${params.alignment_n_secondary}-\
+${edyeet_align_pct_id_display}\
+K${params.alignment_mash_kmer}\
+${alignment_merge_cmd}\
+${alignment_split_cmd}\
+${alignment_exclude_cmd}\
+"""
+seqwish_prefix = """${alignment_prefix}\
+.seqwish-\
+k${params.seqwish_min_match_length}-\
+B${params.seqwish_transclose_batch}\
+"""
+smoothxg_prefix = """${seqwish_prefix}\
+.smoothxg-\
+w${params.smoothxg_max_block_weight}-\
+j${params.smoothxg_max_path_jump}-\
+e${params.smoothxg_max_edge_jump}-\
+I${params.smoothxg_block_id_min}-\
+p${smoothxg_poa_params_display}-M-J0.7-K-G150\
+"""
 } else {
   // take the given prefix
   file_name_prefix_display= "${params.file_name_prefix}.pggb"
@@ -86,11 +86,13 @@ if (!params.file_name_prefix || params.file_name_prefix == "pggb") {
 }
 
 process edyeet {
+  publishDir "${params.outdir}/alignment", mode: "${params.publish_dir_mode}"
+
   input:
-  tuple val(f), path(fasta)
+    tuple val(f), path(fasta)
 
   output:
-  tuple val(f), path(fasta), path("${f}${alignment_prefix}.paf")
+    tuple val(f), path("${f}${alignment_prefix}.paf")
 
   """
   edyeet ${alignment_exclude_cmd} \
@@ -104,16 +106,18 @@ process edyeet {
      -k ${params.alignment_mash_kmer} \
      -t ${task.cpus} \
      $fasta $fasta \
-     >${f}${alignment_prefix}.paf 
+     >${f}${alignment_prefix}.paf
   """
 }
 
 process wfmash {
+  publishDir "${params.outdir}/alignment", mode: "${params.publish_dir_mode}"
+
   input:
-  tuple val(f), path(fasta)
+    tuple val(f), path(fasta)
 
   output:
-  tuple val(f), path(fasta), path("${f}${alignment_prefix}.paf")
+    tuple val(f), path("${f}${alignment_prefix}.paf")
 
   """
   wfmash ${alignment_exclude_cmd} \
@@ -126,15 +130,16 @@ process wfmash {
      -k ${params.alignment_mash_kmer} \
      -t ${task.cpus} \
      $fasta $fasta \
-     >${f}${alignment_prefix}.paf 
+     >${f}${alignment_prefix}.paf
   """
 }
 
 process seqwish {
   publishDir "${params.outdir}/seqwish", mode: "${params.publish_dir_mode}"
 
   input:
-  tuple val(f), path(fasta), path(alignment)
+    tuple val(f), path(fasta)
+    path(alignment)
 
   output:
     tuple val(f), path("${f}${seqwish_prefix}.gfa")
@@ -186,11 +191,13 @@ process smoothxg {
 }
 
 process odgiBuild {
+  publishDir "${params.outdir}/odgi_build", mode: "${params.publish_dir_mode}"
+
   input:
-  path(graph)
+    path(graph)
 
   output:
-  path("${graph}.og")
+    path("${graph}.og")
 
   """
   odgi build -g $graph -o ${graph}.og -P -t ${task.cpus}
@@ -200,11 +207,11 @@ process odgiBuild {
 process odgiStats {
   publishDir "${params.outdir}/odgi_stats", mode: "${params.publish_dir_mode}"
 
-  input: 
-  path(graph)
+  input:
+    path(graph)
 
   output:
-  path("${graph}.stats")
+    path("${graph}.stats")
 
   """
   odgi stats -i "${graph}" -S -s -d -l > "${graph}.stats" 2>&1
@@ -215,10 +222,10 @@ process odgiViz {
   publishDir "${params.outdir}/odgi_viz", mode: "${params.publish_dir_mode}"
 
   input:
-  path(graph)
+    path(graph)
 
   output:
-  path("${graph}.viz_mqc.png")
+    path("${graph}.viz_mqc.png")
 
   """
   odgi viz \
@@ -275,6 +282,20 @@ process odgiDraw {
   """
 }
 
+process pigzOutputFiles {
+  publishDir "${params.outdir}/compressed_outputs", mode: "${params.publish_dir_mode}"
+
+  input:
+    path(graph)
+
+  output:
+    path("${graph}.gz")
+
+  """
+  pigz -q -p ${task.cpus} $graph -f -k
+  """
+}
+
 // TODO ONCE OUR CUSTOM MULTIQC VERSION IS IN A MULTIQC RELEASE, WE CAN CHANGE THIS
 process multiQC {
   publishDir "${params.outdir}", mode: "${params.publish_dir_mode}"
@@ -298,15 +319,15 @@ workflow {
   main:
     if (params.wfmash == false) {
       edyeet(fasta)
-      seqwish(edyeet.out)
+      seqwish(fasta, edyeet.out.collect{it[1]})
     } else {
       wfmash(fasta)
-      seqwish(wfmash.out)
+      seqwish(fasta, wfmash.out.collect{it[1]})
     }
     smoothxg(seqwish.out)
-    if (params.do_stats) { 
-      odgiBuild(seqwish.out.collect{it[1]}.mix(smoothxg.out.gfa_smooth, smoothxg.out.consensus_smooth.flatten())) 
-      odgiStats(odgiBuild.out)  
+    if (params.do_stats) {
+      odgiBuild(seqwish.out.collect{it[1]}.mix(smoothxg.out.gfa_smooth, smoothxg.out.consensus_smooth.flatten()))
+      odgiStats(odgiBuild.out)
     }
     else {
       odgiBuild(smoothxg.out.gfa_smooth)
@@ -322,6 +343,14 @@ workflow {
       odgiDrawOut = odgiDraw(odgiLayout.out)
     }
 
+    if (params.do_compression) {
+        if (params.wfmash == false) {
+            pigzOutputFiles(seqwish.out.collect{it[1]}.mix(smoothxg.out.gfa_smooth, smoothxg.out.consensus_smooth.flatten(), odgiBuild.out, smoothxg.out.maf_smooth, edyeet.out.collect{it[1]}))
+        } else {
+            pigzOutputFiles(seqwish.out.collect{it[1]}.mix(smoothxg.out.gfa_smooth, smoothxg.out.consensus_smooth.flatten(), odgiBuild.out, smoothxg.out.maf_smooth, wfmash.out.collect{it[1]}))
+        }
+    }
+
     multiQC(
       odgiStats.out.collect().ifEmpty([]),
       odgiVizOut.collect().ifEmpty([]),
@@ -391,7 +420,7 @@ def helpMessage() {
       --smoothxg_ratio_contain [n]    minimum short length / long length ratio to compare sequences for the containment
                                       metric in the clustering [default: 0]
       --smoothxg_poa_params [str]     score parameters for POA in the form of match,mismatch,gap1,ext1,gap2,ext2
-                                      [default: 1,4,6,2,26,1]                                         
+                                      [default: 1,4,6,2,26,1]
 
     Visualization options:
       --do_viz                        Generate 1D visualisations of the built graphs [default: OFF]
@@ -405,6 +434,7 @@ def helpMessage() {
       --max_multiqc_email_size [str]  Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB)
       -name [str]                     Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic.
       --file_name_prefix [str]        Prefix for the output file names. If 'pggb', the file names will be very verbose and contain all parameters for each process. [default: --input]
+      --do_compression                Compress alignment (.paf), graph (.gfa, .og), and MSA (.maf) outputs [default: OFF]
 
     AWSBatch options:
       --awsqueue [str]                The AWSBatch JobQueue that needs to be set when running on AWSBatch
@@ -415,7 +445,7 @@ def helpMessage() {
 
 // Has the run name been specified by the user?
 // this has the bonus effect of catching both -name and --name
-// TODO INVOKE THIS AGAIN ONCE IT IS CLEAR HOW TO ADD A NAME TO THE RUN 
+// TODO INVOKE THIS AGAIN ONCE IT IS CLEAR HOW TO ADD A NAME TO THE RUN
 // TODO ERROR: You used a core Nextflow option with two hyphens: '--name'. Please resubmit with '-name'
 /*
 custom_runName = params.name
@@ -664,11 +694,11 @@ process output_documentation {
     publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode
 
     input:
-    file output_docs from ch_output_docs
-    file images from ch_output_docs_images
+      file output_docs from ch_output_docs
+      file images from ch_output_docs_images
 
     output:
-    file 'results_description.html'
+      file 'results_description.html'
 
     script:
     """

diff --git a/nextflow.config b/nextflow.config
@@ -71,6 +71,8 @@ params {
   help = false
   tracedir = "${params.outdir}/pipeline_info"
 
+  do_compression = false
+
   // Config options
   custom_config_version = 'master'
   custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}"

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -38,11 +38,17 @@
                     "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.",
                     "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$"
                 },
+                "do_compression": {
+                    "type": "boolean",
+                    "fa_icon": "fas fa-file-archive",
+                    "description": "Compress output files."
+                },
                 "do_stats": {
                     "type": "boolean",
                     "default": true,
                     "hidden": true,
-                    "fa_icon": "fas fa-file-csv"
+                    "fa_icon": "fas fa-file-csv",
+                    "description": "Perform statistics evaluation."
                 }
             }
         },
@@ -72,26 +78,22 @@
                 "alignment_n_secondary": {
                     "type": "integer",
                     "default": 10,
-                    "description": "Number of secondary mappings to retain in 'map' filter mode.",
-                    "fa_icon": "fab fa-draft2digital"
+                    "description": "Number of secondary mappings to retain in 'map' filter mode."
                 },
                 "alignment_segment_length": {
                     "type": "integer",
                     "default": 10000,
-                    "description": "Segment length for mapping.",
-                    "fa_icon": "fab fa-draft2digital"
+                    "description": "Segment length for mapping."
                 },
                 "alignment_block_length": {
                     "type": "integer",
                     "default": 30000,
-                    "description": "Minimum block length filter for mapping.",
-                    "fa_icon": "fab fa-draft2digital"
+                    "description": "Minimum block length filter for mapping."
                 },
                 "alignment_mash_kmer": {
                     "type": "integer",
                     "default": 16,
-                    "description": "Kmer size for mashmap.",
-                    "fa_icon": "fab fa-draft2digital"
+                    "description": "Kmer size for mashmap."
                 },
                 "alignment_merge_segments": {
                     "type": "boolean",
@@ -119,14 +121,12 @@
                 "seqwish_min_match_length": {
                     "type": "integer",
                     "default": 19,
-                    "description": "Ignore exact matches below this length.",
-                    "fa_icon": "fab fa-draft2digital"
+                    "description": "Ignore exact matches below this length."
                 },
                 "seqwish_transclose_batch": {
                     "type": "integer",
                     "default": 1000000,
-                    "description": "Number of bp to use for transitive closure batch.",
-                    "fa_icon": "fab fa-draft2digital"
+                    "description": "Number of bp to use for transitive closure batch."
                 }
             },
             "fa_icon": "fas fa-dna"
@@ -140,26 +140,22 @@
                 "smoothxg_max_block_weight": {
                     "type": "integer",
                     "default": 10000,
-                    "description": "Maximum seed sequence in block.",
-                    "fa_icon": "fab fa-draft2digital"
+                    "description": "Maximum seed sequence in block."
                 },
                 "smoothxg_max_path_jump": {
                     "type": "integer",
                     "default": 5000,
-                    "description": "Maximum path jump to include in block.",
-                    "fa_icon": "fab fa-draft2digital"
+                    "description": "Maximum path jump to include in block."
                 },
                 "smoothxg_max_edge_jump": {
                     "type": "integer",
                     "default": 5000,
-                    "description": "Maximum edge jump before breaking.",
-                    "fa_icon": "fab fa-draft2digital"
+                    "description": "Maximum edge jump before breaking."
                 },
                 "smoothxg_max_poa_length": {
                     "type": "integer",
                     "default": 10000,
-                    "description": "Maximum sequence length to put into POA.",
-                    "fa_icon": "fab fa-draft2digital"
+                    "description": "Maximum sequence length to put into POA."
                 },
                 "smoothxg_consensus_spec": {
                     "type": "string",