Merge pull request #193 from skrakau/compress_cat_files

Compress CAT output files
nf-core · May 20, 2021 · b1c378a
2 parents 3ced95c + 8217b16
commit b1c378a
Show file tree

Hide file tree

Showing 4 changed files with 23 additions and 12 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -24,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#179](https://github.com/nf-core/mag/pull/179) - By default BUSCO now performs automated lineage selection instead of using the bacteria_odb10 lineage as reference. Specific lineage datasets can still be provided via `--busco_reference`.
 - [#178](https://github.com/nf-core/mag/pull/178) - Change output file: `results/GenomeBinning/QC/quast_and_busco_summary.tsv` -> `results/GenomeBinning/bin_summary.tsv`, contains GTDB-Tk results as well.
 - [#191](https://github.com/nf-core/mag/pull/191) - Update to nf-core 1.14 `TEMPLATE`
+- [#193](https://github.com/nf-core/mag/pull/193) - Compress CAT output files [#180](https://github.com/nf-core/mag/issues/180)
 
 ### `Fixed`
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -126,6 +126,7 @@ params {
         'cat' {
             publish_by_id  = true
             publish_dir    = "Taxonomy/CAT"
+            publish_files  = ['log':'', 'gz':'']
         }
         'gtdbtk_classify' {
             args           = "--extension fa"

diff --git a/docs/output.md b/docs/output.md
@@ -265,12 +265,14 @@ Besides the reference files or output files created by BUSCO, the following summ
 **Output files:**
 
 * `Taxonomy/CAT/[assembler]/`
-  * `[assembler]-[sample/group].ORF2LCA.txt`: Tab-delimited files containing the lineage of each contig
-  * `[assembler]-[sample/group].names.txt`: Taxonomy classification, with names of each lineage levels instead og taxids
-  * `[assembler]-[sample/group].predicted_proteins.faa`: predicted protein sequences for each genome bins, in fasta format
-  * `[assembler]-[sample/group].predicted_proteins.gff`: predicted protein features for each genome bins, in gff format
+  * `[assembler]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names
+  * `[assembler]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names
+* `Taxonomy/CAT/[assembler]/raw/`
+  * `[assembler]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format
+  * `[assembler]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format
+  * `[assembler]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig
+  * `[assembler]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins
   * `[assembler]-[sample/group].log`: Log files
-  * `[assembler]-[sample/group].bin2classification.txt`: Taxonomy classification of the genome bins
 
 ### GTDB-Tk
 

diff --git a/modules/local/cat.nf b/modules/local/cat.nf
@@ -23,22 +23,29 @@ process CAT {
     tuple val(db_name), path("database/*"), path("taxonomy/*")
 
     output:
-    path("*.names.txt")                 , emit: tax_classification
-    path("raw/*.ORF2LCA.txt")           , emit: orf2lca
-    path("raw/*.predicted_proteins.faa"), emit: faa
-    path("raw/*.predicted_proteins.gff"), emit: gff
-    path("raw/*.log")                   , emit: log
-    path("raw/*.bin2classification.txt"), emit: tax_classification_taxids
-    path '*.version.txt'                , emit: version
+    path("*.names.txt.gz")                 , emit: tax_classification
+    path("raw/*.ORF2LCA.txt.gz")           , emit: orf2lca
+    path("raw/*.predicted_proteins.faa.gz"), emit: faa
+    path("raw/*.predicted_proteins.gff.gz"), emit: gff
+    path("raw/*.log")                      , emit: log
+    path("raw/*.bin2classification.txt.gz"), emit: tax_classification_taxids
+    path '*.version.txt'                   , emit: version
 
     script:
     def software = getSoftwareName(task.process)
     """
     CAT bins -b "bins/" -d database/ -t taxonomy/ -n "${task.cpus}" -s .fa --top 6 -o "${meta.assembler}-${meta.id}" --I_know_what_Im_doing
     CAT add_names -i "${meta.assembler}-${meta.id}.ORF2LCA.txt" -o "${meta.assembler}-${meta.id}.ORF2LCA.names.txt" -t taxonomy/
     CAT add_names -i "${meta.assembler}-${meta.id}.bin2classification.txt" -o "${meta.assembler}-${meta.id}.bin2classification.names.txt" -t taxonomy/
+
     mkdir raw
     mv *.ORF2LCA.txt *.predicted_proteins.faa *.predicted_proteins.gff *.log *.bin2classification.txt raw/
+    gzip "raw/${meta.assembler}-${meta.id}.ORF2LCA.txt" \
+         "raw/${meta.assembler}-${meta.id}.concatenated.predicted_proteins.faa" \
+         "raw/${meta.assembler}-${meta.id}.concatenated.predicted_proteins.gff" \
+         "raw/${meta.assembler}-${meta.id}.bin2classification.txt" \
+         "${meta.assembler}-${meta.id}.ORF2LCA.names.txt" \
+         "${meta.assembler}-${meta.id}.bin2classification.names.txt"
 
     CAT --version | sed "s/CAT v//; s/(.*//" > ${software}.version.txt
     """