-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #129 from icgc-argo-workflows/seq-data-to-lane-fas…
…tq@0.1.0 [release]
- Loading branch information
Showing
22 changed files
with
617 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
.gitignore | ||
.nextflow* | ||
tests | ||
work | ||
outdir |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Container image for the seq-data-to-lane-fastq tool: Ubuntu 20.04 with samtools built from source | ||
# and the package's Python scripts copied to /tools. | ||
FROM ubuntu:20.04 | ||
|
||
# OCI label pointing at the source repository of this image. | ||
LABEL org.opencontainers.image.source https://github.com/icgc-argo-workflows/dna-seq-processing-tools | ||
|
||
# Keep apt/debconf from prompting for input during the image build. | ||
ENV DEBIAN_FRONTEND noninteractive | ||
|
||
# Install Python 3 (pip + headers), curl, and C build tooling / compression and ncurses dev libraries | ||
# (presumably the build dependencies of the samtools compile below -- confirm before trimming the list). | ||
RUN apt-get update -y && \ | ||
apt-get install -y software-properties-common python3-pip python3-dev curl && \ | ||
apt-get install -y libz-dev pkg-config libtool m4 autotools-dev automake libncurses5-dev libbz2-dev liblzma-dev | ||
|
||
# Download the samtools release tarball for SAMTOOLS_VERSION from GitHub, then configure, build | ||
# and install it under /usr/local. The version can be overridden at build time via --build-arg. | ||
ARG SAMTOOLS_VERSION=1.15 | ||
RUN cd /tmp \ | ||
&& curl -sSL -o samtools-$SAMTOOLS_VERSION.tar.bz2 --retry 10 https://github.com/samtools/samtools/releases/download/$SAMTOOLS_VERSION/samtools-$SAMTOOLS_VERSION.tar.bz2 \ | ||
&& bunzip2 -c samtools-$SAMTOOLS_VERSION.tar.bz2 |tar xf - \ | ||
&& cd samtools-$SAMTOOLS_VERSION \ | ||
&& ./configure --prefix=/usr/local \ | ||
&& make \ | ||
&& make install | ||
|
||
# Put /tools first on PATH; the package's Python scripts are copied there by the COPY below. | ||
ENV PATH="/tools:${PATH}" | ||
|
||
# Copy every *.py file from the build context into /tools. | ||
COPY *.py /tools/ | ||
|
||
# Create group and user "ubuntu" (GID/UID 1000) and a home directory owned by that user. | ||
# NOTE(review): no USER instruction is visible in this file, so this alone does not change the runtime user. | ||
RUN groupadd -g 1000 ubuntu && \ | ||
useradd -l -u 1000 -g ubuntu ubuntu && \ | ||
install -d -m 0755 -o ubuntu -g ubuntu /home/ubuntu | ||
|
||
# Commands are launched through /usr/bin/env, so whatever is passed to the container is resolved via PATH. | ||
ENTRYPOINT ["/usr/bin/env"] | ||
|
||
# Default command when none is given: an interactive bash shell (run as `/usr/bin/env /bin/bash`). | ||
CMD ["/bin/bash"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# Nextflow Package `seq-data-to-lane-fastq` | ||
|
||
A simple wrapper written in `nextflow` for the sequencing processing tool that converts all input sequencing data into unaligned, lane-level FASTQ files. | ||
The tool supports both aligned BAM and unaligned FASTQ formats, with paired-end or single-end reads. | ||
|
||
## Package development | ||
|
||
The initial version of this package was created by the WorkFlow Package Manager CLI tool. Please refer to | ||
the [documentation](https://wfpm.readthedocs.io) for details on the development procedure, including | ||
versioning, updating, CI testing and releasing. | ||
|
||
|
||
## Inputs | ||
### Required | ||
- `metadata_json`: JSON file containing donor/sample/specimen/experiment/read_groups/files metadata | ||
- `seq_files`: Sequencing reads in aligned BAM or unaligned FASTQ formats. Supported input formats: {BAM, *.fq.gz, *.fastq.gz, *.fq.bz2, *.fastq.bz2} | ||
|
||
### Optional | ||
- `reads_max_discard_fraction`: Max fraction of reads allowed to be discarded when reverting aligned BAM to unaligned | ||
- `tempdir`: Specify directory for temporary files | ||
- `cpus`: Set cpu number for running the tool | ||
- `mem`: Set memory(G) for running the tool | ||
- `publish_dir`: Specify directory for getting output files | ||
|
||
## Outputs | ||
- `lane_fastq`: All fastq files | ||
- `file_pair_map_csv`: CSV file containing three columns per lane: `read_group_id`, `file_r1`, `file_r2` | ||
|
||
## Usage | ||
|
||
### Run the package directly | ||
|
||
With inputs prepared, you should be able to run the package directly using the following command. | ||
Please replace the params file with a real one (with all required parameters and input files). Example | ||
params file(s) can be found in the `tests` folder. | ||
|
||
``` | ||
nextflow run icgc-argo-workflows/dna-seq-processing-tools/seq-data-to-lane-fastq/main.nf -r seq-data-to-lane-fastq.v0.1.0 -params-file <your-params-json-file> | ||
``` | ||
|
||
### Import the package as a dependency | ||
|
||
To import this package into another package as a dependency, please follow these steps at the | ||
importing package side: | ||
|
||
1. add this package's URI `github.com/icgc-argo-workflows/dna-seq-processing-tools/seq-data-to-lane-fastq@0.1.0` in the `dependencies` list of the `pkg.json` file | ||
2. run `wfpm install` to install the dependency | ||
3. add the `include` statement in the main Nextflow script to import the dependent package from this path: `./wfpr_modules/github.com/icgc-argo-workflows/dna-seq-processing-tools/seq-data-to-lane-fastq@0.1.0/main.nf` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
#!/usr/bin/env nextflow | ||
|
||
/* | ||
Copyright (C) 2021, icgc-argo | ||
This program is free software: you can redistribute it and/or modify | ||
it under the terms of the GNU Affero General Public License as published by | ||
the Free Software Foundation, either version 3 of the License, or | ||
(at your option) any later version. | ||
This program is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU Affero General Public License for more details. | ||
You should have received a copy of the GNU Affero General Public License | ||
along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
Authors: | ||
Linda Xiang | ||
*/ | ||
|
||
/********************************************************************/ | ||
/* this block is auto-generated based on info from pkg.json where */ | ||
/* changes can be made if needed, do NOT modify this block manually */ | ||
nextflow.enable.dsl = 2 | ||
version = '0.1.0' // package version | ||
|
||
container = [ | ||
'ghcr.io': 'ghcr.io/icgc-argo-workflows/dna-seq-processing-tools.seq-data-to-lane-fastq' | ||
] | ||
default_container_registry = 'ghcr.io' | ||
/********************************************************************/ | ||
|
||
|
||
// universal params go here | ||
// The three container params below override the image used by the process when non-empty: | ||
// `container` replaces the whole image name, `container_registry` selects an entry of the `container` map, | ||
// and `container_version` replaces the package version as the image tag. | ||
params.container_registry = "" | ||
params.container_version = "" | ||
params.container = "" | ||
|
||
// Resources requested by the process; `cpus` is also passed to main.py as `-n`. | ||
params.cpus = 1 | ||
params.mem = 1 // GB | ||
params.publish_dir = "" // set to empty string will disable publishDir | ||
|
||
|
||
// tool specific params go here, add / change as needed | ||
params.metadata_json = "" | ||
params.seq_files = "" | ||
params.reads_max_discard_fraction = 0.05 // passed to main.py as `-d` | ||
params.tempdir = "NO_DIR" // sentinel: when left as "NO_DIR", no `-t` option is passed to main.py | ||
|
||
|
||
// Runs main.py on the metadata JSON and the sequencing file(s), writing results into `out/`. | ||
// Emits the lane-level FASTQ files (`lane_fastq`) and the read-group-to-file map CSV (`file_pair_map_csv`). | ||
process seqDataToLaneFastq { | ||
// Image resolution: params.container if set, otherwise the registry entry from the `container` map; | ||
// the tag is params.container_version if set, otherwise the package `version`. | ||
container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}" | ||
// Outputs are copied to <publish_dir>/<process name with ':' replaced by '_'> only when publish_dir is non-empty. | ||
publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir ? true : false | ||
|
||
cpus params.cpus | ||
memory "${params.mem} GB" | ||
|
||
input: // metadata JSON file, then the sequencing file(s) handed to main.py via `-s` | ||
path metadata_json | ||
path seq | ||
|
||
output: // files produced under out/ by the script below | ||
path "out/*{fq,fastq,fq.gz,fastq.gz}", emit: lane_fastq | ||
path "out/rgs_file_pair_map.csv", emit: file_pair_map_csv | ||
|
||
script: | ||
// add and initialize variables here as needed | ||
|
||
// Only pass `-t <dir>` when params.tempdir was changed from its "NO_DIR" sentinel. | ||
arg_tempdir = params.tempdir != 'NO_DIR' ? "-t ${params.tempdir}" : "" | ||
|
||
""" | ||
mkdir -p out | ||
main.py \ | ||
-p ${metadata_json} \ | ||
-s ${seq} \ | ||
-d ${params.reads_max_discard_fraction} \ | ||
-n ${params.cpus} \ | ||
-o out ${arg_tempdir} | ||
""" | ||
} | ||
|
||
|
||
// this provides an entry point for this main script, so it can be run directly without cloning the repo | ||
// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> -params-file xxx | ||
// Unnamed entry workflow: passes the metadata JSON as a file and gathers every path matched by | ||
// params.seq_files into a single collected list, so the process receives all sequencing files at once. | ||
workflow { | ||
seqDataToLaneFastq( | ||
file(params.metadata_json), | ||
Channel.fromPath(params.seq_files).collect() | ||
) | ||
} |
Oops, something went wrong.