Skip to content

Commit

Permalink
Merge pull request #129 from icgc-argo-workflows/seq-data-to-lane-fas…
Browse files Browse the repository at this point in the history
…tq@0.1.0

[release]
  • Loading branch information
lindaxiang authored Jun 15, 2022
2 parents 654df1d + 7106202 commit edd1ba4
Show file tree
Hide file tree
Showing 22 changed files with 617 additions and 0 deletions.
5 changes: 5 additions & 0 deletions seq-data-to-lane-fastq/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
.gitignore
.nextflow*
tests
work
outdir
31 changes: 31 additions & 0 deletions seq-data-to-lane-fastq/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
FROM ubuntu:20.04

# Link the image to its source repository (OCI image annotation).
LABEL org.opencontainers.image.source="https://github.com/icgc-argo-workflows/dna-seq-processing-tools"

# Suppress interactive prompts from apt/dpkg while packages are installed.
ENV DEBIAN_FRONTEND=noninteractive

# Python tooling, curl, and the autotools / development libraries used by the
# source build further down. Everything is installed in a single layer and the
# apt package index is removed in that same layer so it is not shipped in the image.
RUN apt-get update -y && \
    apt-get install -y \
        software-properties-common python3-pip python3-dev curl \
        libz-dev pkg-config libtool m4 autotools-dev automake \
        libncurses5-dev libbz2-dev liblzma-dev && \
    rm -rf /var/lib/apt/lists/*

# install samtools: download the release tarball, build it and install under /usr/local.
# The version can be overridden at build time with --build-arg SAMTOOLS_VERSION=<x.y>.
ARG SAMTOOLS_VERSION=1.15
# -f makes curl fail on an HTTP error instead of saving the error page as the tarball;
# tar extracts the .bz2 directly so a decompression failure aborts the && chain;
# the tarball and build tree are removed in the same layer to keep the image small.
RUN cd /tmp \
    && curl -fsSL -o samtools-$SAMTOOLS_VERSION.tar.bz2 --retry 10 https://github.com/samtools/samtools/releases/download/$SAMTOOLS_VERSION/samtools-$SAMTOOLS_VERSION.tar.bz2 \
    && tar xjf samtools-$SAMTOOLS_VERSION.tar.bz2 \
    && cd samtools-$SAMTOOLS_VERSION \
    && ./configure --prefix=/usr/local \
    && make \
    && make install \
    && cd /tmp \
    && rm -rf samtools-$SAMTOOLS_VERSION samtools-$SAMTOOLS_VERSION.tar.bz2

# Put /tools first on PATH so the scripts copied below can be invoked by name.
ENV PATH="/tools:${PATH}"

# Ship every Python script from the build context as a tool.
COPY *.py /tools/

# Create an "ubuntu" user and group (uid/gid 1000) together with its home directory.
# NOTE(review): no USER instruction follows, so the image's default user is
# unchanged by this step; confirm that is intended.
RUN groupadd -g 1000 ubuntu \
    && useradd -l -u 1000 -g ubuntu ubuntu \
    && install -d -m 0755 -o ubuntu -g ubuntu /home/ubuntu

# Commands are resolved through env; with no arguments the container starts bash.
ENTRYPOINT ["/usr/bin/env"]

CMD ["/bin/bash"]
48 changes: 48 additions & 0 deletions seq-data-to-lane-fastq/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Nextflow Package `seq-data-to-lane-fastq`

A simple wrapper written in `nextflow` for the sequencing processing tool that converts all input sequencing data into unaligned, lane-level FASTQ files.
The tool supports both aligned BAM and unaligned FASTQ inputs, with paired-end or single-end reads.

## Package development

The initial version of this package was created by the WorkFlow Package Manager CLI tool. Please refer to
the [documentation](https://wfpm.readthedocs.io) for details on the development procedure, including
versioning, updating, CI testing and releasing.


## Inputs
### Required
- `metadata_json`: JSON file containing donor/sample/specimen/experiment/read_groups/files metadata
- `seq_files`: Sequencing reads in aligned BAM or unaligned FASTQ formats. Supported input format: {BAM, *.fq.gz, *.fastq.gz, *.fq.bz2, *.fastq.bz2}

### Optional
- `reads_max_discard_fraction`: Max fraction of reads allowed to be discarded when reverting aligned BAM to unaligned
- `tempdir`: Specify directory for temporary files
- `cpus`: Set cpu number for running the tool
- `mem`: Set memory (in GB) for running the tool
- `publish_dir`: Specify directory for getting output files

## Outputs
- `lane_fastq`: All fastq files
- `file_pair_map_csv`: CSV file containing three columns per lane: `read_group_id`, `file_r1`, `file_r2`

## Usage

### Run the package directly

With inputs prepared, you should be able to run the package directly using the following command.
Please replace the params file with a real one (with all required parameters and input files). Example
params file(s) can be found in the `tests` folder.

```
nextflow run icgc-argo-workflows/dna-seq-processing-tools/seq-data-to-lane-fastq/main.nf -r seq-data-to-lane-fastq.v0.1.0 -params-file <your-params-json-file>
```

### Import the package as a dependency

To import this package into another package as a dependency, please follow these steps at the
importing package side:

1. add this package's URI `github.com/icgc-argo-workflows/dna-seq-processing-tools/seq-data-to-lane-fastq@0.1.0` in the `dependencies` list of the `pkg.json` file
2. run `wfpm install` to install the dependency
3. add the `include` statement in the main Nextflow script to import the dependent package from this path: `./wfpr_modules/github.com/icgc-argo-workflows/dna-seq-processing-tools/seq-data-to-lane-fastq@0.1.0/main.nf`
94 changes: 94 additions & 0 deletions seq-data-to-lane-fastq/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#!/usr/bin/env nextflow

/*
Copyright (C) 2021, icgc-argo
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Authors:
Linda Xiang
*/

/********************************************************************/
/* this block is auto-generated based on info from pkg.json where */
/* changes can be made if needed, do NOT modify this block manually */
nextflow.enable.dsl = 2
version = '0.1.0' // package version

container = [
'ghcr.io': 'ghcr.io/icgc-argo-workflows/dna-seq-processing-tools.seq-data-to-lane-fastq'
]
default_container_registry = 'ghcr.io'
/********************************************************************/


// universal params go here
// The three container params are overrides: when left empty, the process falls
// back to the registry, image and version declared in the auto-generated block above.
params.container_registry = ""
params.container_version = ""
params.container = ""

params.cpus = 1
params.mem = 1 // GB
params.publish_dir = "" // leaving this as an empty string disables publishDir


// tool specific params go here, add / change as needed
params.metadata_json = ""
params.seq_files = ""
// passed to main.py as -d; per the README, the max fraction of reads allowed to be
// discarded when reverting aligned BAM to unaligned
params.reads_max_discard_fraction = 0.05
// "NO_DIR" is a sentinel: the -t option is only passed to main.py for any other value
params.tempdir = "NO_DIR"


// Runs main.py inside the package container to turn the input sequencing files
// into lane-level FASTQ files.
//
// Inputs:
//   metadata_json - metadata JSON file, passed to main.py as -p
//   seq           - one or more sequencing files, passed to main.py as -s
// Outputs:
//   lane_fastq        - files under out/ matching *{fq,fastq,fq.gz,fastq.gz}
//   file_pair_map_csv - out/rgs_file_pair_map.csv
process seqDataToLaneFastq {
  // params.container overrides the whole image name; otherwise the image is looked up
  // by registry (params.container_registry or the default). The tag is
  // params.container_version, falling back to the package version.
  container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
  // Publishing is only enabled when params.publish_dir is non-empty.
  publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir ? true : false

  cpus params.cpus
  memory "${params.mem} GB"

  input:  // input, make update as needed
    path metadata_json
    path seq

  output:  // output, make update as needed
    path "out/*{fq,fastq,fq.gz,fastq.gz}", emit: lane_fastq
    path "out/rgs_file_pair_map.csv", emit: file_pair_map_csv

  script:
    // Declared with `def` so the variable is local to this task; undeclared
    // variables are treated as globals by Nextflow.
    // The -t option is only emitted when a real temp dir was requested.
    def arg_tempdir = params.tempdir != 'NO_DIR' ? "-t ${params.tempdir}" : ""

    // task.cpus reflects the CPUs actually allocated to the task (it equals
    // params.cpus unless the cpus directive is overridden in configuration).
    """
    mkdir -p out
    main.py \
      -p ${metadata_json} \
      -s ${seq} \
      -d ${params.reads_max_discard_fraction} \
      -n ${task.cpus} \
      -o out ${arg_tempdir}
    """
}


// Entry point for this main script, so the package can be run directly without cloning the repo:
//   nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> -params-file xxx
workflow {
  // Metadata is a single file; the sequencing files are gathered into one list
  // so the process receives all of them in a single invocation.
  def metadata = file(params.metadata_json)
  def seq_inputs = Channel.fromPath(params.seq_files).collect()

  seqDataToLaneFastq(metadata, seq_inputs)
}
Loading

0 comments on commit edd1ba4

Please sign in to comment.