diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index aac970a..492d2e8 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -8,31 +8,15 @@ // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. "dockerfile": "../Dockerfile" }, - // "mounts": [ - // "source=${localWorkspaceFolder},target=/fastqwiper,type=bind,consistency=cached" - // ], - "customizations": { - "vscode": { - "extensions": [ - "ms-python.vscode-pylance", - "ms-python.black-formatter", - "snakemake.snakemake-lang" - ] - } - } - // Features to add to the dev container. More info: https://containers.dev/features. - // "features": {}, - - // Use 'forwardPorts' to make a list of ports inside the container available locally. - // "forwardPorts": [], - - // Uncomment the next line to run commands after the container is created. - // "postCreateCommand": "cat /etc/os-release", - - // Configure tool-specific properties. - // "customizations": {}, - - // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. 
- // "remoteUser": "devcontainer" + "features": { + "ghcr.io/devcontainers/features/git:1": {} + }, + "customizations" : { + "jetbrains" : { + "backend" : "PyCharm" + } + }, + workspaceFolder: "/fastqwiper", + workspaceMount: "source=${localWorkspaceFolder},target=/fastqwiper,type=bind" } diff --git a/.gitignore b/.gitignore index ce11cad..d61c4b8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ #my custom -.vscode bbmap/ logs/ +bbmap # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index e69de29..0000000 diff --git a/Dockerfile b/Dockerfile index 9ff626f..1c929b3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,7 +4,7 @@ LABEL maintainer="mazza.tommaso@gmail.com" ENV bbmap_version 39.01 ENV PATH "$PATH:/tmp/jre1.8.0_161/bin/" -RUN mamba config --set channel_priority strict +# RUN mamba config --set channel_priority strict RUN mamba install python=3.10 RUN mamba install -c conda-forge -c bioconda snakemake=7.32.3 -y RUN mamba install -c conda-forge colorama click -y @@ -28,13 +28,14 @@ WORKDIR /fastqwiper COPY pipeline pipeline COPY run_wiping.sh run_wiping.sh +COPY data data RUN chmod +x run_wiping.sh ENTRYPOINT ["/fastqwiper/run_wiping.sh"] -# paired mode, 4 cores, sample name, #rows-per-chunk -CMD ["paired", "4", "sample", "50000000"] +# paired mode, 4 cores, sample name, #rows-per-chunk, ASCII offset (33=Sanger, 64=old Solexa) +CMD ["paired", "4", "sample", "50000000", "33"] # docker build -t test . 
-# docker run --rm -ti --name test -v "D:\desktop_links\CSS-Bioinformatics\FastqWiper\FastqWiper\data:/fastqwiper/data" test paired 8 sample 50000000 +# docker run --rm -ti --name test -v "D:\Projects\fastqwiper\data:/fastqwiper/data" test paired 4 sample 50000000 33 # docker exec -ti test /bin/bash \ No newline at end of file diff --git a/README.md b/README.md index 0e729e5..c33cc33 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ There are QUICK and a SLOW methods to configure `FastqWiper`'s wor 2. Once downloaded the image, type: -CMD: `docker run --rm -ti --name fastqwiper -v "YOUR_LOCAL_PATH_TO_DATA_FOLDER:/fastqwiper/data" mazzalab/fastqwiper paired 8 sample 50000000` +CMD: `docker run --rm -ti --name fastqwiper -v "YOUR_LOCAL_PATH_TO_DATA_FOLDER:/fastqwiper/data" mazzalab/fastqwiper paired 8 sample 50000000 33` #### Another quick way (Singularity) 1. Pull the Singularity image from the Cloud Library: @@ -79,11 +79,11 @@ CMD: `docker run --rm -ti --name fastqwiper -v "YOUR_LOCAL_PATH_TO_DATA_FOLDER:/ 2. 
Once downloaded the image (e.g., fastqwiper.sif_2023.2.70.sif), type: -CMD `singularity run --bind /scratch/tom/fastqwiper_singularity/data:/fastqwiper/data --writable-tmpfs fastqwiper.sif_2023.2.70.sif paired 8 sample 50000000` +CMD `singularity run --bind /scratch/tom/fastqwiper_singularity/data:/fastqwiper/data --writable-tmpfs fastqwiper.sif_2023.2.70.sif paired 8 sample 50000000 33` If you want to bind the `.singularity` cache folder and the `logs` folder, you can omit `--writable-tmpfs`, create the folders `.singularity` and `logs` (`mkdir .singularity logs`) on the host system, and use this command instead: -CMD: `singularity run --bind YOUR_LOCAL_PATH_TO_DATA_FOLDER/:/fastqwiper/data --bind YOUR_LOCAL_PATH_TO_.singularity_FOLDER/:/fastqwiper/.snakemake --bind YOUR_LOCAL_PATH_TO_LOGS_FOLDER/:/fastqwiper/logs fastqwiper.sif_2023.2.70.sif paired 8 sample 50000000` +CMD: `singularity run --bind YOUR_LOCAL_PATH_TO_DATA_FOLDER/:/fastqwiper/data --bind YOUR_LOCAL_PATH_TO_.singularity_FOLDER/:/fastqwiper/.snakemake --bind YOUR_LOCAL_PATH_TO_LOGS_FOLDER/:/fastqwiper/logs fastqwiper.sif_2023.2.70.sif paired 8 sample 50000000 33` For both **Docker** and **Singularity**: @@ -91,8 +91,8 @@ For both **Docker** and **Singularity**: - `paired` triggers the cleaning of R1 and R2. Alternatively, `single` will trigger the wipe of individual FASTQ files; - `8` is the number of your choice of computing cores to be spawned; - `sample` is part of the names of the FASTQ files to be wiped. Be aware that: for paired-end files (e.g., "sample_R1.fastq.gz" and "sample_R2.fastq.gz"), your files must finish with `_R1.fastq.gz` and `_R2.fastq.gz`. Therefore, the argument to pass is everything before these texts: `sample` in this case. For single end/individual files (e.g., "excerpt_R1_001.fastq.gz"), your file must end with the string `.fastq.gz`; the preceding text, i.e., "excerpt_R1_001" in this case, will be the text to be passed to the command as an argument. 
-- `50000000` is the number of rows-per-chunk (used when cores>1. It must be a number multiple of 4). Increasing this number too much would reduce the parallelism advantage. Decreasing this number too much would increase the number of chunks more than the number of available cpus, making parallelism unefficient. Choose this number wisely depending on the total number of reads in your starting file. - +- `50000000` (optional) is the number of rows-per-chunk (used when cores>1. It must be a number multiple of 4). Increasing this number too much would reduce the parallelism advantage. Decreasing this number too much would increase the number of chunks more than the number of available cpus, making parallelism inefficient. Choose this number wisely depending on the total number of reads in your starting file. +- `33` (optional) is the ASCII offset (33=Sanger, 64=old Solexa) #### The slow way (Linux & Mac OS) To enable the use of preconfigured [pipelines](https://github.com/mazzalab/fastqwiper/tree/main/pipeline), you need to install **Snakemake**. The recommended way to install Snakemake is via Conda, because it enables **Snakemake** to [handle software dependencies of your workflow](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#integrated-package-management). @@ -146,14 +146,14 @@ Copy the fastq files you want to fix in the `data` folder. #### Paired-end files - **Get a dry run** of a pipeline (e.g., `fix_wipe_pairs_reads_sequential.smk`):
-`snakemake --config sample_name=my_sample -s pipeline/fix_wipe_pairs_reads_sequential.smk --use-conda --cores 4` +`snakemake --config sample_name=my_sample qin=33 -s pipeline/fix_wipe_pairs_reads_sequential.smk --use-conda --cores 4` - **Generate the planned DAG**:
-`snakemake --config sample_name=my_sample -s pipeline/fix_wipe_pairs_reads_sequential.smk --dag | dot -Tpdf > dag.pdf`

+`snakemake --config sample_name=my_sample qin=33 -s pipeline/fix_wipe_pairs_reads_sequential.smk --dag | dot -Tpdf > dag.pdf`

- **Run the pipeline** (n.b., during the first execution, Snakemake will download and install some required remote packages and may take longer). The number of computing cores can be tuned accordingly:
-`snakemake --config sample_name=my_sample -s pipeline/fix_wipe_single_reads_sequential.smk --use-conda --cores 2` +`snakemake --config sample_name=my_sample qin=33 -s pipeline/fix_wipe_single_reads_sequential.smk --use-conda --cores 2` Fixed files will be copied in the `data` folder and will be suffixed with the string `_fixed_wiped_paired_interleaving`. We remind that the `fix_wipe_pairs_reads_sequential.smk` and `fix_wipe_pairs_reads_parallel.smk` pipelines perform the following actions: diff --git a/Singularity.def b/Singularity.def index 296ae52..39e3f6b 100644 --- a/Singularity.def +++ b/Singularity.def @@ -39,9 +39,9 @@ From: condaforge/mambaforge chmod 777 /fastqwiper/run_wiping.sh %runscript - if [ $# -eq 4 ] || [ $# -eq 1 ]; then + if [ $# -eq 0 ] || { [ $# -ge 3 ] && [ $# -le 5 ]; }; then exec /fastqwiper/run_wiping.sh $@ else - echo "You must provide four arguments [mode (paired, single), # of cores (int), sample name (string), chunk size (int))" + echo "You must provide 3 mandatory and up to 2 optional arguments [computing mode ('paired' or 'single'), # of cores (int), sample name (string), chunk size (optional, int), ASCII offset (optional, 33 or 64)]" exit 1 fi \ No newline at end of file diff --git a/data/excerpt_S1_R1_001.fastq.gz b/data/excerpt_S1_R1_001.fastq.gz deleted file mode 100644 index a887a5b..0000000 Binary files a/data/excerpt_S1_R1_001.fastq.gz and /dev/null differ diff --git a/data/excerpt_S1_R2_001.fastq.gz b/data/excerpt_S1_R2_001.fastq.gz deleted file mode 100644 index 9e530b6..0000000 Binary files a/data/excerpt_S1_R2_001.fastq.gz and /dev/null differ diff --git a/data/osteo-FS_S1_R1_001_chunk_WRONG.fastq b/data/osteo-FS_S1_R1_001_chunk_WRONG.fastq deleted file mode 100644 index da75157..0000000 --- a/data/osteo-FS_S1_R1_001_chunk_WRONG.fastq +++ /dev/null @@ -1,6165 +0,0 @@ -@NS500299:185:HK57NBGXG:1:11101:16144:1046 2:N:0:CGAGGCTG -NGACCAAGGAGGAGAAGAGACCTTTGCTGNCCCANCANGGGNGCTNNNGNTGTCACCTGGACAGGGGGCAGCCGTG -+ -#A {log}" + "bbmap/repair.sh 
qin={QIN} in={input.in1} in2={input.in2} out={output.out1} out2={output.out2} outsingle={output.out3} 2> {log}" onsuccess: print("Workflow finished, no error. Clean-up and shutdown") diff --git a/pipeline/fix_wipe_pairs_reads_sequential.smk b/pipeline/fix_wipe_pairs_reads_sequential.smk index 3ef4f01..f740194 100644 --- a/pipeline/fix_wipe_pairs_reads_sequential.smk +++ b/pipeline/fix_wipe_pairs_reads_sequential.smk @@ -3,6 +3,7 @@ from snakemake.io import expand, temp SAMPLES=config["sample_name"] +QIN=config["qin"] rule all: input: @@ -62,9 +63,9 @@ rule fix_interleaving: log: "logs/interleaving/interleaving.{sample}.log" message: - "Repair reads interleaving from {input}." + "Repair reads interleaving from {input} (qin={QIN})." threads: 1 cache: False shell: - "bbmap/repair.sh in={input.in1} in2={input.in2} out={output.out1} out2={output.out2} outsingle={output.out3} 2> {log}" + "bbmap/repair.sh qin={QIN} in={input.in1} in2={input.in2} out={output.out1} out2={output.out2} outsingle={output.out3} 2> {log}" diff --git a/run_wiping.sh b/run_wiping.sh index 2139edf..2ac24a3 100644 --- a/run_wiping.sh +++ b/run_wiping.sh @@ -1,34 +1,50 @@ #!/bin/bash -mode=$1 -cores=$(($2)) -sample_name=$3 -chunk_size=$(($4)) +if [ $# -eq 0 ]; then + mode="paired" + cores=4 + sample_name="sample" + chunk_size=50000000 + qin=33 -# Enter the FastqWiper folder -cd /fastqwiper + echo "No arguments supplied; running with default arguments: $mode $cores $sample_name $chunk_size $qin" +elif [ $# -ge 3 ] && [ $# -le 5 ]; then + mode=$1 + cores=$(($2)) + sample_name=$3 -if [ $mode == "paired" ] -then - if [ $cores -gt 1 ] - then - echo "Processing paired-end files in parallel" - snakemake --config sample_name=$sample_name chunk_size=$chunk_size -s ./pipeline/fix_wipe_pairs_reads_parallel.smk --use-conda --cores $cores - else - echo "Processing paired-end files sequentially" - snakemake --config sample_name=$sample_name -s ./pipeline/fix_wipe_pairs_reads_sequential.smk --use-conda --cores $cores + # Optional + chunk_size=$(($4)) + if [ "$chunk_size" -eq 
"0" ]; then + chunk_size=50000000 + fi + + # Optional + qin=$(($5)) + if [ "$qin" -eq "0" ]; then + qin=33 fi -elif [ $mode == "single" ] -then - if [ $cores -gt 1 ] - then - echo "Processing single-end file in parallel" - snakemake --config sample_name=$sample_name chunk_size=$chunk_size -s ./pipeline/fix_wipe_single_reads_parallel.smk --use-conda --cores $cores + + if [ "$mode" == "paired" ]; then + if [ "$cores" -gt 1 ]; then + echo "Processing paired-end files in parallel" + snakemake --config sample_name=$sample_name chunk_size=$chunk_size qin=$qin -s ./pipeline/fix_wipe_pairs_reads_parallel.smk --use-conda --cores $cores + else + echo "Processing paired-end files sequentially" + snakemake --config sample_name=$sample_name qin=$qin -s ./pipeline/fix_wipe_pairs_reads_sequential.smk --use-conda --cores $cores + fi + elif [ "$mode" == "single" ]; then + if [ "$cores" -gt 1 ]; then + echo "Processing single-end file in parallel" + snakemake --config sample_name=$sample_name chunk_size=$chunk_size -s ./pipeline/fix_wipe_single_reads_parallel.smk --use-conda --cores $cores + else + echo "Processing single-end file sequentially" + snakemake --config sample_name=$sample_name -s ./pipeline/fix_wipe_single_reads_sequential.smk --use-conda --cores $cores + fi else - echo "Processing single-end file sequentially" - snakemake --config sample_name=$sample_name -s ./pipeline/fix_wipe_single_reads_sequential.smk --use-conda --cores $cores + echo "Allowed computing modes are: 'paired' or 'single'" fi else - echo "Snakemake help" - snakemake --help + echo "You must provide three + 2 optional arguments [computing mode ('paired' or 'single'), # of cores (int), sample name (string), chunk size (optional, int), ASCII offset (optional, 33 or 64)]" + exit 1 fi