Merge pull request #129 from icgc-argo-workflows/seq-data-to-lane-fas…

…tq@0.1.0 [release]
icgc-argo-workflows · Jun 15, 2022 · edd1ba4 · edd1ba4
2 parents 654df1d + 7106202
commit edd1ba4
Show file tree

Hide file tree

Showing 22 changed files with 617 additions and 0 deletions.
diff --git a/seq-data-to-lane-fastq/.dockerignore b/seq-data-to-lane-fastq/.dockerignore
@@ -0,0 +1,5 @@
+.gitignore
+.nextflow*
+tests
+work
+outdir
diff --git a/seq-data-to-lane-fastq/Dockerfile b/seq-data-to-lane-fastq/Dockerfile
@@ -0,0 +1,31 @@
+FROM ubuntu:20.04
+
+LABEL org.opencontainers.image.source="https://github.com/icgc-argo-workflows/dna-seq-processing-tools"
+
+# Build-time only: stops apt from prompting during the build without
+# leaving the setting in the environment of running containers.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Build dependencies for compiling samtools, plus python3 for the *.py
+# scripts copied into /tools below. The apt package index is deleted in the
+# same layer that downloads it so it never ends up in the image.
+RUN apt-get update -y && \
+    apt-get install -y software-properties-common python3-pip python3-dev curl && \
+    apt-get install -y libz-dev pkg-config libtool m4 autotools-dev automake libncurses5-dev libbz2-dev liblzma-dev && \
+    rm -rf /var/lib/apt/lists/*
+
+# install samtools
+# - curl -f makes an HTTP error fail the build instead of saving the error page
+# - tar -xjf replaces the `bunzip2 | tar` pipe, whose left-hand failures are
+#   not reported by /bin/sh
+# - the source tree and tarball are removed in the same layer once installed
+ARG SAMTOOLS_VERSION=1.15
+RUN cd /tmp \
+    && curl -fsSL -o samtools-$SAMTOOLS_VERSION.tar.bz2 --retry 10 https://github.com/samtools/samtools/releases/download/$SAMTOOLS_VERSION/samtools-$SAMTOOLS_VERSION.tar.bz2 \
+    && tar -xjf samtools-$SAMTOOLS_VERSION.tar.bz2 \
+    && cd samtools-$SAMTOOLS_VERSION \
+    && ./configure --prefix=/usr/local \
+    && make \
+    && make install \
+    && cd /tmp \
+    && rm -rf samtools-$SAMTOOLS_VERSION samtools-$SAMTOOLS_VERSION.tar.bz2
+
+# Put /tools first on PATH so the scripts copied there can be called by name.
+ENV PATH="/tools:${PATH}"
+
+COPY *.py /tools/
+
+# Create an unprivileged ubuntu user (uid/gid 1000) with a home directory.
+# NOTE(review): no USER instruction selects this account, so the image's
+# default user is still root - confirm whether that is intentional.
+RUN groupadd -g 1000 ubuntu && \
+    useradd -l -u 1000 -g ubuntu ubuntu && \
+    install -d -m 0755 -o ubuntu -g ubuntu /home/ubuntu
+
+# With env as the entrypoint, the command given at run time is executed
+# directly; with no command, an interactive bash is started.
+ENTRYPOINT ["/usr/bin/env"]
+
+CMD ["/bin/bash"]
diff --git a/seq-data-to-lane-fastq/README.md b/seq-data-to-lane-fastq/README.md
@@ -0,0 +1,48 @@
+# Nextflow Package `seq-data-to-lane-fastq`
+
+A simple wrapper written in `nextflow` around the sequencing processing tool that converts all input sequencing data into unaligned, lane-level FASTQ files.  
+The tool supports both aligned BAM and unaligned FASTQ input, with paired-end or single-end reads.
+
+## Package development
+
+The initial version of this package was created by the WorkFlow Package Manager CLI tool. Please refer to
+the [documentation](https://wfpm.readthedocs.io) for details on the development procedure including
+versioning, updating, CI testing and releasing.
+
+
+## Inputs
+### Required
+- `metadata_json`: JSON file containing donor/sample/specimen/experiment/read_groups/files metadata
+- `seq_files`: Sequencing reads in aligned BAM or unaligned FASTQ formats. Supported input format: {BAM, *.fq.gz, *.fastq.gz, *.fq.bz2, *.fastq.bz2}
+
+### Optional
+- `reads_max_discard_fraction`: Max fraction of reads allowed to be discarded when reverting aligned BAM to unaligned
+- `tempdir`: Specify directory for temporary files
+- `cpus`: Set cpu number for running the tool
+- `mem`: Set memory(G) for running the tool
+- `publish_dir`: Specify directory for getting output files
+
+## Outputs
+- `lane_fastq`: All fastq files 
+- `file_pair_map_csv`: CSV file containing three columns per lane: `read_group_id`, `file_r1`, `file_r2`
+
+## Usage
+
+### Run the package directly
+
+With inputs prepared, you should be able to run the package directly using the following command.
+Please replace the params file with a real one (with all required parameters and input files). Example
+params file(s) can be found in the `tests` folder.
+
+```
+nextflow run icgc-argo-workflows/dna-seq-processing-tools/seq-data-to-lane-fastq/main.nf -r seq-data-to-lane-fastq.v0.1.0 -params-file <your-params-json-file>
+```
+
+### Import the package as a dependency
+
+To import this package into another package as a dependency, please follow these steps at the
+importing package side:
+
+1. add this package's URI `github.com/icgc-argo-workflows/dna-seq-processing-tools/seq-data-to-lane-fastq@0.1.0` in the `dependencies` list of the `pkg.json` file
+2. run `wfpm install` to install the dependency
+3. add the `include` statement in the main Nextflow script to import the dependent package from this path: `./wfpr_modules/github.com/icgc-argo-workflows/dna-seq-processing-tools/seq-data-to-lane-fastq@0.1.0/main.nf`
diff --git a/seq-data-to-lane-fastq/main.nf b/seq-data-to-lane-fastq/main.nf
@@ -0,0 +1,94 @@
+#!/usr/bin/env nextflow
+
+/*
+  Copyright (C) 2021,  icgc-argo
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Affero General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU Affero General Public License for more details.
+
+  You should have received a copy of the GNU Affero General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+  Authors:
+    Linda Xiang
+*/
+
+/********************************************************************/
+/* this block is auto-generated based on info from pkg.json where   */
+/* changes can be made if needed, do NOT modify this block manually */
+nextflow.enable.dsl = 2
+version = '0.1.0'  // package version
+
+container = [
+    'ghcr.io': 'ghcr.io/icgc-argo-workflows/dna-seq-processing-tools.seq-data-to-lane-fastq'
+]
+default_container_registry = 'ghcr.io'
+/********************************************************************/
+
+
+// universal params go here
+// The three container params are combined in the process `container` directive:
+// a non-empty params.container replaces the image name, otherwise the image is
+// looked up by params.container_registry (falling back to default_container_registry);
+// a non-empty params.container_version replaces the package version as the tag.
+params.container_registry = ""
+params.container_version = ""
+params.container = ""
+
+// params.cpus sets the process `cpus` directive and is also passed to main.py as -n
+params.cpus = 1
+params.mem = 1  // GB
+params.publish_dir = ""  // set to empty string will disable publishDir
+
+
+// tool specific params go here, add / change as needed
+params.metadata_json = ""
+params.seq_files = ""
+// passed to main.py as -d
+params.reads_max_discard_fraction = 0.05
+// 'NO_DIR' is a sentinel: the -t option is only passed to main.py when another value is set
+params.tempdir = "NO_DIR"
+
+
+// Runs main.py inside the tool container on the metadata JSON and the staged
+// sequencing files, and emits the FASTQ files and the read-group file-pair
+// map that it writes into ./out.
+process seqDataToLaneFastq {
+  // image: params.container if set, else the registry lookup; tag: params.container_version if set, else `version`
+  container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
+  // outputs are copied to <publish_dir>/<process name> only when params.publish_dir is non-empty
+  publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir ? true : false
+
+  cpus params.cpus
+  memory "${params.mem} GB"
+
+  input:  // input, make update as needed
+    path metadata_json
+    path seq
+
+  output:  // output, make update as needed
+    path "out/*{fq,fastq,fq.gz,fastq.gz}", emit: lane_fastq
+    path "out/rgs_file_pair_map.csv", emit: file_pair_map_csv
+
+  script:
+    // `def` keeps this variable local to the task; without it the name would
+    // be global to the whole script.
+    // -t is only passed on when a temp dir other than the 'NO_DIR' sentinel is configured.
+    def arg_tempdir = params.tempdir != 'NO_DIR' ? "-t ${params.tempdir}" : ""
+
+    """
+    mkdir -p out
+
+    main.py \
+      -p ${metadata_json} \
+      -s ${seq} \
+      -d ${params.reads_max_discard_fraction} \
+      -n ${params.cpus} \
+      -o out ${arg_tempdir}
+
+    """
+}
+
+
+// This provides an entry point for this main script, so it can be run directly without cloning the repo
+// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> -params-file xxx
+workflow {
+  seqDataToLaneFastq(
+    file(params.metadata_json),
+    // collect() gathers every file matched by params.seq_files into a single list,
+    // so the process receives all of them together
+    Channel.fromPath(params.seq_files).collect()
+  )
+}