From 6c00019a84487c50301342fff4e8a2987302cb48 Mon Sep 17 00:00:00 2001 From: mazzalab Date: Sun, 19 May 2024 21:54:23 +0200 Subject: [PATCH 1/5] Removing dependencies and fixing fix_header_line method --- environment.yml | 2 - fastq_wiper/__init__.py | 52 -------- fastq_wiper/color_log_formatter.py | 57 -------- fastq_wiper/wiper.py | 202 +++++++++++++---------------- requirements.txt | 4 - 5 files changed, 87 insertions(+), 230 deletions(-) delete mode 100644 fastq_wiper/color_log_formatter.py delete mode 100644 requirements.txt diff --git a/environment.yml b/environment.yml index 3430e32..9efbf89 100644 --- a/environment.yml +++ b/environment.yml @@ -3,8 +3,6 @@ channels: - conda-forge - defaults dependencies: - - colorama - - click - setuptools - pytest - pytest-cov \ No newline at end of file diff --git a/fastq_wiper/__init__.py b/fastq_wiper/__init__.py index 7f3ba3a..e69de29 100644 --- a/fastq_wiper/__init__.py +++ b/fastq_wiper/__init__.py @@ -1,52 +0,0 @@ -"""" -File description -""" - -import logging -from fastq_wiper.color_log_formatter import ColorFormatter - -__author__ = "Tommaso Mazza" -__copyright__ = "Copyright 2020, The FastqWiper Project" -__version__ = "0.0.1" -__maintainer__ = "Tommaso Mazza" -__email__ = "bioinformatics@css-mendel.it" -__status__ = "Development" -__date__ = "28/12/2020" -__creator__ = "t.mazza" -__license__ = u""" - Copyright (C) 2020 Tommaso Mazza - Viale Regina Margherita 261, 00198 Rome, Italy - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. 
- - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301 USA - """ - -# logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) -# logging.getLogger(__name__).addHandler(logging.NullHandler()) -log = logging.getLogger(__name__) -log.setLevel(logging.DEBUG) - -# DEBUGFORMATTER = '%(filename)s:%(name)s:%(funcName)s:%(lineno)d: %(message)s' -DEBUGFORMATTER = '%(asctime)s - %(name)s/%(filename)s(%(funcName)s) - %(levelname)s - %(message)s' -"""Debug file formatter.""" - -# defines the stream handler -ch = logging.StreamHandler() # creates the handler -ch.setLevel(logging.INFO) # sets the handler info -cf = ColorFormatter(DEBUGFORMATTER) -ch.setFormatter(cf) # sets the handler formatting - -# adds the handler to the global variable: log -log.addHandler(ch) \ No newline at end of file diff --git a/fastq_wiper/color_log_formatter.py b/fastq_wiper/color_log_formatter.py deleted file mode 100644 index 8c94b2d..0000000 --- a/fastq_wiper/color_log_formatter.py +++ /dev/null @@ -1,57 +0,0 @@ -"""" -File description -""" - -import colorama -import logging -from copy import copy - -__author__ = "Tommaso Mazza" -__copyright__ = "Copyright 2020, The FastqWiper Project" -__version__ = "0.0.1" -__maintainer__ = "Tommaso Mazza" -__email__ = "bioinformatics@css-mendel.it" -__status__ = "Development" -__date__ = "28/12/2020" -__creator__ = "t.mazza" -__license__ = u""" - Copyright (C) 2020 Tommaso Mazza - Viale Regina Margherita 261, 00198 Rome, Italy - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301 USA - """ - -LOG_COLORS = { - logging.INFO: colorama.Fore.BLUE, - logging.ERROR: colorama.Fore.RED, - logging.WARNING: colorama.Fore.YELLOW -} - - -class ColorFormatter(logging.Formatter): - def format(self, record, *args, **kwargs): - # if the corresponding logger has children, they may receive modified - # record, so we want to keep it intact - new_record = copy(record) - if new_record.levelno in LOG_COLORS: - # we want levelname to be in different color, so let's modify it - new_record.levelname = "{color_begin}{level}{color_end}".format( - level=new_record.levelname, - color_begin=LOG_COLORS[new_record.levelno], - color_end=colorama.Style.RESET_ALL, - ) - # now we can let standard formatting take care of the rest - return super(ColorFormatter, self).format(new_record, *args, **kwargs) diff --git a/fastq_wiper/wiper.py b/fastq_wiper/wiper.py index 8a1250c..91069b6 100644 --- a/fastq_wiper/wiper.py +++ b/fastq_wiper/wiper.py @@ -3,15 +3,13 @@ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -from colorama import init - -init(convert=True) -import click import gzip import re import codecs -from fastq_wiper import log +import argparse +import logging +logging.basicConfig(level=logging.DEBUG) # region Variables for final report tot_lines: int = 0 @@ -71,18 +69,23 @@ def write_fastq_file(file_path: str): def fix_header_line(line: str, checkpoint_flags: dict) -> str: header: str = line.rstrip() - # Drop all character preceding the last @ character of the header - if header.rfind("@") > 0: - 
header = header[header.rfind("@") :] - - if header == "@": - checkpoint_flags["at"] = False - else: + # TODO: FIX this method + if header.startswith("@") and len(header) > 1: checkpoint_flags["at"] = True - - global head_at - head_at += 1 - + else: + if header.startswith("@") and len(header) == 1: + checkpoint_flags["at"] = False + else: + # Drop all character preceding the last @ character of the header + if header.rfind("@") > 0: + header = header[header.rfind("@"):] + checkpoint_flags["at"] = True + + global head_at + head_at += 1 + else: + checkpoint_flags["at"] = False + return header @@ -141,7 +144,7 @@ def skip_empty_lines(line: str, fin) -> str: def print_log_during_reading(lines, log_frequency): if lines % log_frequency == 0: - log.info(f"Cleaned {lines} reads") + logging.info(f"Cleaned {lines} reads") def print_to_file(header, raw_seq, head_qual_sep, qual, fout): @@ -166,9 +169,7 @@ def print_log_to_file(log_out): flog.write(f"BAD QUAL lines: {qual_out_range}/{tot_lines}" + "\n") flog.write(f"BAD '+' lines: {plus_row}/{tot_lines}" + "\n") flog.write(f"BAD SEQ lines: {seq_odd_chars}/{tot_lines}" + "\n") - flog.write( - f"Not printable or uncompliant header lines: {head_print}/{tot_lines}" + "\n" - ) + flog.write(f"Not printable or uncompliant header lines: {head_print}/{tot_lines}" + "\n") flog.write(f"Not printable qual lines: {qual_odd_chars}/{tot_lines}" + "\n") flog.write(f"Fixed header lines: {head_at}/{tot_lines}" + "\n") flog.write(f"Fixed + lines: {head_plus}/{tot_lines}" + "\n") @@ -181,72 +182,60 @@ def print_log_to_file(log_out): def print_log_to_screen(): global tot_lines, clean_reads, seq_len_neq_qual_len, head_print, seq_odd_chars, plus_row, qual_odd_chars, unexpected_line - click.echo( - click.style( - f"Clean lines: {clean_reads*4}/{tot_lines} ({round((clean_reads*4 / tot_lines) * 100, 2)}%)", - fg="blue" if tot_lines == clean_reads else "yellow", - ) - ) - click.echo( - click.style( - f"Len(SEQ) neq Len(QUAL): 
{seq_len_neq_qual_len}/{tot_lines}", - fg="blue" if seq_len_neq_qual_len == 0 else "red", - ) - ) - click.echo( - click.style( - f"BAD QUAL lines: {qual_out_range}/{tot_lines}", - fg="blue" if qual_out_range == 0 else "red", - ) - ) - click.echo( - click.style( - f"BAD '+' lines: {plus_row}/{tot_lines}", - fg="blue" if plus_row == 0 else "red", - ) - ) - click.echo( - click.style( - f"BAD SEQ lines: {seq_odd_chars}/{tot_lines}", - fg="blue" if seq_odd_chars == 0 else "red", - ) - ) - click.echo( - click.style( - f"Not printable or uncompliant header lines: {head_print}/{tot_lines}", - fg="blue" if head_print == 0 else "red", - ) - ) - click.echo( - click.style( - f"Not printable qual lines: {qual_odd_chars}/{tot_lines}", - fg="blue" if qual_odd_chars == 0 else "red", - ) - ) - click.echo( - click.style( - f"Fixed header lines: {head_at}/{tot_lines}", - fg="blue" if head_at == 0 else "yellow", - ) - ) - click.echo( - click.style( - f"Fixed + lines: {head_plus}/{tot_lines}", - fg="blue" if head_plus == 0 else "yellow", - ) - ) - click.echo( - click.style( - f"Blank lines: {blank}/{tot_lines}", - fg="blue" if blank == 0 else "yellow", - ) - ) - click.echo( - click.style( - f"Missplaced lines: {unexpected_line}/{tot_lines}", - fg="blue" if unexpected_line == 0 else "yellow", - ) - ) + if tot_lines == clean_reads: + logging.info(f"Clean lines: {clean_reads*4}/{tot_lines} ({round((clean_reads*4 / tot_lines) * 100, 2)}%)") + else: + logging.warning(f"Clean lines: {clean_reads * 4}/{tot_lines} ({round((clean_reads * 4 / tot_lines) * 100, 2)}%)") + + if seq_len_neq_qual_len == 0: + logging.info(f"Len(SEQ) neq Len(QUAL): {seq_len_neq_qual_len}/{tot_lines}") + else: + logging.warning(f"Len(SEQ) neq Len(QUAL): {seq_len_neq_qual_len}/{tot_lines}") + + if qual_out_range == 0: + logging.info(f"BAD QUAL lines: {qual_out_range}/{tot_lines}") + else: + logging.warning(f"BAD QUAL lines: {qual_out_range}/{tot_lines}") + + if plus_row == 0: + logging.info(f"BAD '+' lines: 
{plus_row}/{tot_lines}") + else: + logging.warning(f"BAD '+' lines: {plus_row}/{tot_lines}") + + if seq_odd_chars == 0: + logging.info(f"BAD SEQ lines: {seq_odd_chars}/{tot_lines}") + else: + logging.warning(f"BAD SEQ lines: {seq_odd_chars}/{tot_lines}") + + if head_print == 0: + logging.info(f"Not printable or uncompliant header lines: {head_print}/{tot_lines}") + else: + logging.warning(f"Not printable or uncompliant header lines: {head_print}/{tot_lines}") + + if qual_odd_chars == 0: + logging.info(f"Not printable qual lines: {qual_odd_chars}/{tot_lines}") + else: + logging.warning(f"Not printable qual lines: {qual_odd_chars}/{tot_lines}") + + if head_at == 0: + logging.info(f"Fixed header lines: {head_at}/{tot_lines}") + else: + logging.warning(f"Fixed header lines: {head_at}/{tot_lines}") + + if head_plus == 0: + logging.info(f"Fixed + lines: {head_plus}/{tot_lines}") + else: + logging.warning(f"Fixed + lines: {head_plus}/{tot_lines}") + + if blank == 0: + logging.info(f"Blank lines: {blank}/{tot_lines}") + else: + logging.warning(f"Blank lines: {blank}/{tot_lines}") + + if unexpected_line == 0: + logging.info(f"Missplaced lines: {unexpected_line}/{tot_lines}") + else: + logging.warning(f"Missplaced lines: {unexpected_line}/{tot_lines}") def reset_flags(checkpoint_flags: dict): @@ -261,43 +250,19 @@ def count_remainder(): return tot_lines % 4 -@click.command() -@click.option( - "--fastq_in", type=click.STRING, required=True, help="The input FASTQ file" -) -@click.option( - "--fastq_out", type=click.STRING, required=True, help="The wiped FASTQ file" -) -@click.option( - "--log_out", - type=click.STRING, - required=False, - help="The file name of the final quality report summary", -) -@click.option( - "--log_frequency", - type=click.INT, - default=500000, - help="The number of reads you want to print a status message", -) def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): # MIN_HEADER_LEN = 10 fin = open_fastq_file(fastq_in) if not 
fin: - click.echo( - click.style( - "The input FASTQ file does not exist or bad extension (.gz or .fastq.gz)", - fg="red", - ) - ) + logging.critical("The input FASTQ file does not exist or bad extension (.gz or .fastq.gz)") else: - click.echo(click.style(f"Start wiping {fastq_in}", fg="blue")) + logging.info(f"Start wiping {fastq_in}") # Open file out stream fout = write_fastq_file(fastq_out) - # change if a lines is parsed succesfully + # change if a lines is parsed successfully checkpoint_flags = { "at": False, "seq": False, @@ -423,7 +388,7 @@ def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): fin.close() # Print short report - click.echo(click.style("Successfully terminated\n", fg="blue")) + logging.info("Successfully terminated\n") if log_out: print_log_to_file(log_out) @@ -432,4 +397,11 @@ def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): if __name__ == "__main__": - wipe_fastq() + parser = argparse.ArgumentParser(description='FastqWiper entrypoint') + parser.add_argument("-i", '--fastq_in', help='The corrupted FASTQ file', required=True) + parser.add_argument("-o", '--fastq_out', help='The wiped FASTQ file', required=True) + parser.add_argument("-l", '--log_out', nargs='?', help='The file name of the final quality report summary') + parser.add_argument("-f", '--log_frequency', type=int, nargs='?', const=500000, help='The number of reads you want to print a status message') + args = parser.parse_args() + + wipe_fastq(args.fastq_in, args.fastq_out,args.log_out, args.log_frequency) diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 12630ee..0000000 --- a/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -colorama=0.4.4 -click=7.1.2 -setuptools~=51.0.0 -mamba=0.12.2 \ No newline at end of file From 6cc94abe8e0afb645e737d67e334950fca5797af Mon Sep 17 00:00:00 2001 From: mazzalab Date: Tue, 21 May 2024 07:56:02 +0200 Subject: [PATCH 2/5] Fixed fix_header_line method. 
Working on "unexpected_lines" count --- .github/workflows/code_coverage.yml | 2 +- .github/workflows/conda_reusable.yml | 2 +- .github/workflows/pypi_reusable.yml | 2 +- Dockerfile | 7 +++---- README.md | 21 ++++++++++----------- Singularity.def | 7 +++---- conda-recipe/meta.yaml | 4 +--- fastq_wiper/wiper.py | 18 +++++------------- setup.py | 15 ++++++++------- 9 files changed, 33 insertions(+), 45 deletions(-) diff --git a/.github/workflows/code_coverage.yml b/.github/workflows/code_coverage.yml index 2b4ed59..a8ab01c 100644 --- a/.github/workflows/code_coverage.yml +++ b/.github/workflows/code_coverage.yml @@ -11,7 +11,7 @@ jobs: env: OS: ${{ matrix.os }} - PYTHON: '3.10' + PYTHON: '3.11' name: Test FastqWiper steps: diff --git a/.github/workflows/conda_reusable.yml b/.github/workflows/conda_reusable.yml index 772541e..2da0597 100644 --- a/.github/workflows/conda_reusable.yml +++ b/.github/workflows/conda_reusable.yml @@ -18,7 +18,7 @@ jobs: max-parallel: 5 matrix: os: ["windows-latest", "ubuntu-latest"] # , "macos-latest"", - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/pypi_reusable.yml b/.github/workflows/pypi_reusable.yml index 0dc7f87..5ad8e7a 100644 --- a/.github/workflows/pypi_reusable.yml +++ b/.github/workflows/pypi_reusable.yml @@ -18,7 +18,7 @@ jobs: max-parallel: 5 matrix: os: ["ubuntu-latest"] - python-version: ["3.10"] + python-version: ["3.11"] steps: - name: Install Miniconda diff --git a/Dockerfile b/Dockerfile index 1c929b3..9a3c924 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,13 +1,12 @@ FROM condaforge/mambaforge LABEL maintainer="mazza.tommaso@gmail.com" -ENV bbmap_version 39.01 +ENV bbmap_version 39.06 ENV PATH "$PATH:/tmp/jre1.8.0_161/bin/" # RUN mamba config --set channel_priority strict -RUN mamba install python=3.10 -RUN mamba install -c conda-forge -c bioconda snakemake=7.32.3 -y -RUN mamba install -c 
conda-forge colorama click -y +RUN mamba install python=3.11 +RUN mamba install -c conda-forge -c bioconda snakemake=8.11.3 -y RUN mamba install -c bioconda trimmomatic -y # Install fastqwiper from conda diff --git a/README.md b/README.md index c33cc33..c51c068 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ `FastqWiper` is a Snakemake-enabled application that wipes out bad reads from broken FASTQ files. Additionally, the available and pre-designed Snakemake [workflows](https://github.com/mazzalab/fastqwiper/tree/main/pipeline) allows **recovering** corrupted `fastq.gz`, **dropping** or **fixing** pesky lines, **removing** unpaired reads, and **fixing** reads interleaving. -* Compatibility: Python ≥3.7, <3.11 +* Compatibility: Python ≥3.7, <3.13 * OS: Windows, Linux, Mac OS (Snakemake workflows through Docker for Windows) * Contributions: [bioinformatics@css-mendel.it](bioinformatics@css-mendel.it) * Docker: https://hub.docker.com/r/mazzalab/fastqwiper @@ -30,7 +30,7 @@ This requires you to install FastqWiper and therefore does not require you to co #### Use Conda ``` -conda create -n fastqwiper python=3.10 +conda create -n fastqwiper python=3.11 conda activate fastqwiper conda install -c bfxcss -c conda-forge fastqwiper @@ -77,13 +77,13 @@ CMD: `docker run --rm -ti --name fastqwiper -v "YOUR_LOCAL_PATH_TO_DATA_FOLDER:/ `singularity pull library://mazzalab/fastqwiper/fastqwiper.sif` -2. Once downloaded the image (e.g., fastqwiper.sif_2023.2.70.sif), type: +2. 
Once downloaded the image (e.g., fastqwiper.sif_2024.1.89.sif), type: -CMD `singularity run --bind /scratch/tom/fastqwiper_singularity/data:/fastqwiper/data --writable-tmpfs fastqwiper.sif_2023.2.70.sif paired 8 sample 50000000 33` +CMD `singularity run --bind YOUR_LOCAL_PATH_TO_DATA_FOLDER:/fastqwiper/data --writable-tmpfs fastqwiper.sif_2024.1.89.sif paired 8 sample 50000000 33` If you want to bind the `.singularity` cache folder and the `logs` folder, you can omit `--writable-tmpfs`, create the folders `.singularity` and `logs` (`mkdir .singularity logs`) on the host system, and use this command instead: -CMD: `singularity run --bind YOUR_LOCAL_PATH_TO_DATA_FOLDER/:/fastqwiper/data --bind YOUR_LOCAL_PATH_TO_.singularity_FOLDER/:/fastqwiper/.snakemake --bind YOUR_LOCAL_PATH_TO_LOGS_FOLDER/:/fastqwiper/logs fastqwiper.sif_2023.2.70.sif paired 8 sample 50000000 33` +CMD: `singularity run --bind YOUR_LOCAL_PATH_TO_DATA_FOLDER/:/fastqwiper/data --bind YOUR_LOCAL_PATH_TO_.SNAKEMAKE_FOLDER/:/fastqwiper/.snakemake --bind YOUR_LOCAL_PATH_TO_LOGS_FOLDER/:/fastqwiper/logs fastqwiper.sif_2024.1.89.sif paired 8 sample 50000000 33` For both **Docker** and **Singularity**: @@ -106,14 +106,13 @@ if you have anaconda/miniconda already installed, or directly installing `Mambaf Then, create and activate a clean environment as above: ``` -mamba create -n fastqwiper python=3.10 +mamba create -n fastqwiper python=3.11 mamba activate fastqwiper ``` -Finally, install a few dependencies: +Finally, install the Snakemake dependency: ``` $ mamba install -c bioconda snakemake -$ mamba install colorama click ``` @@ -153,7 +152,7 @@ Copy the fastq files you want to fix in the `data` folder. - **Run the pipeline** (n.b., during the first execution, Snakemake will download and install some required remote packages and may take longer). The number of computing cores can be tuned accordingly:
-`snakemake --config sample_name=my_sample qin=33 -s pipeline/fix_wipe_single_reads_sequential.smk --use-conda --cores 2` +`snakemake --config sample_name=my_sample -s pipeline/fix_wipe_single_reads_sequential.smk --use-conda --cores 2` Fixed files will be copied in the `data` folder and will be suffixed with the string `_fixed_wiped_paired_interleaving`. We remind that the `fix_wipe_pairs_reads_sequential.smk` and `fix_wipe_pairs_reads_parallel.smk` pipelines perform the following actions: @@ -177,12 +176,12 @@ We remind that the `fix_wipe_pairs_reads_sequential.smk` and `fix_wipe_pairs_rea # Author **Tommaso Mazza** -[![Tweeting](https://img.shields.io/twitter/url/http/shields.io.svg?style=social)](https://twitter.com/irongraft) +[![X](https://img.shields.io/badge/X-%23000000.svg?style=for-the-badge&logo=X&logoColor=white)](https://twitter.com/irongraft) [![LinkedIn](https://img.shields.io/badge/linkedin-%230077B5.svg?style=for-the-badge&logo=linkedin&logoColor=white)](https://www.linkedin.com/in/tommasomazza/) Laboratory of Bioinformatics
Fondazione IRCCS Casa Sollievo della Sofferenza
Viale Regina Margherita 261 - 00198 Roma IT
Tel: +39 06 44160526 - Fax: +39 06 44160548
-E-mail: t.mazza@css-mendel.it
+E-mail: t.mazza@operapadrepio.it
Web page: http://www.css-mendel.it
Web page: http://bioinformatics.css-mendel.it
diff --git a/Singularity.def b/Singularity.def index 7a67a6c..046e747 100644 --- a/Singularity.def +++ b/Singularity.def @@ -13,9 +13,8 @@ From: condaforge/mambaforge PATH=$PATH:/tmp/jre1.8.0_161/bin/ %post - mamba install python=3.10 - mamba install -c conda-forge -c bioconda snakemake=7.32.3 -y - mamba install -c conda-forge colorama click -y + mamba install python=3.11 + mamba install -c conda-forge -c bioconda snakemake=8.11.3 -y mamba install -c bioconda trimmomatic -y mamba install -y -c bfxcss -c conda-forge fastqwiper @@ -24,7 +23,7 @@ From: condaforge/mambaforge apt-get install gzrt -y # Software versions - BBMAP_VER="39.01" + BBMAP_VER="39.06" wget -c https://sourceforge.net/projects/bbmap/files/BBMap_$BBMAP_VER.tar.gz/download -O /fastqwiper/BBMap_$BBMAP_VER.tar.gz cd fastqwiper diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 51aa648..108e4d0 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -28,8 +28,6 @@ requirements: run: - python - - colorama - - click test: imports: @@ -39,6 +37,6 @@ test: about: home: https://github.com/mazzalab/fastqwiper - summary: An ensamble method to recover corrupted FASTQ files, drop or fix pesky lines, remove unpaired reads, and fix reads interleaving + summary: An ensemble method to recover corrupted FASTQ files, drop or fix pesky lines, remove unpaired reads, and fix reads interleaving license: MIT license_file: LICENSE.txt diff --git a/fastq_wiper/wiper.py b/fastq_wiper/wiper.py index 91069b6..c532502 100644 --- a/fastq_wiper/wiper.py +++ b/fastq_wiper/wiper.py @@ -69,22 +69,14 @@ def write_fastq_file(file_path: str): def fix_header_line(line: str, checkpoint_flags: dict) -> str: header: str = line.rstrip() - # TODO: FIX this method if header.startswith("@") and len(header) > 1: checkpoint_flags["at"] = True - else: - if header.startswith("@") and len(header) == 1: - checkpoint_flags["at"] = False - else: - # Drop all character preceding the last @ character of the header - if 
header.rfind("@") > 0: - header = header[header.rfind("@"):] - checkpoint_flags["at"] = True + elif header.rfind("@") > 0: + header = header[header.rfind("@"):] + checkpoint_flags["at"] = True - global head_at - head_at += 1 - else: - checkpoint_flags["at"] = False + global head_at + head_at += 1 return header diff --git a/setup.py b/setup.py index 77f9c3d..db67984 100644 --- a/setup.py +++ b/setup.py @@ -21,10 +21,8 @@ from setuptools import setup, find_packages print(sys.version_info) -if sys.version_info.major != 3 and ( - sys.version_info.minor < 7 or sys.version_info.minor > 10 -): - sys.exit("Sorry, Python < 3.7 and > 3.10 are not supported") +if sys.version_info.major != 3 or sys.version_info.minor < 7: + sys.exit("Sorry, Python < 3.7 is not supported") with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() @@ -41,7 +39,8 @@ version=VERSION, author="Tommaso Mazza", author_email="bioinformatics@css-mendel.it", - description="An ensamble method to recover corrupted FASTQ files, drop or fix pesky lines, remove unpaired reads, and fix reads interleaving", + description="An ensemble method to recover corrupted FASTQ files, drop or fix pesky lines, remove unpaired reads, " + "and fix reads interleaving", url="https://github.com/mazzalab/fastqwiper", packages=find_packages(), include_package_data=True, @@ -49,7 +48,7 @@ long_description=long_description, long_description_content_type="text/markdown", entry_points={"console_scripts": ["fastqwiper = fastq_wiper.wiper:wipe_fastq"]}, - install_requires=["setuptools", "colorama", "click"], + install_requires=["setuptools"], classifiers=[ "Development Status :: 5 - Production/Stable", "Intended Audience :: Science/Research", @@ -57,6 +56,8 @@ "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "License :: OSI Approved :: MIT License", 
"Operating System :: OS Independent", ], @@ -67,5 +68,5 @@ "Developmental plan": "https://github.com/mazzalab/fastqwiper/projects", }, keywords="genomics, ngs, fastq, bioinformatics", - python_requires=">=3.7,<3.11", + python_requires=">=3.7,<3.13", ) From 14ea4cf8aa898687067092f59d51ac88f580f290 Mon Sep 17 00:00:00 2001 From: mazzalab Date: Fri, 24 May 2024 08:19:53 +0200 Subject: [PATCH 3/5] Rewriting wipe_fastq method - still unstable --- fastq_wiper/wiper.py | 269 ++++++++++++++++++++++++++++--------------- 1 file changed, 178 insertions(+), 91 deletions(-) diff --git a/fastq_wiper/wiper.py b/fastq_wiper/wiper.py index c532502..09909a4 100644 --- a/fastq_wiper/wiper.py +++ b/fastq_wiper/wiper.py @@ -12,18 +12,18 @@ logging.basicConfig(level=logging.DEBUG) # region Variables for final report +blank: int = 0 tot_lines: int = 0 +bad_seq: int = 0 +fixed_header: int = 0 +bad_header: int = 0 +bad_plus: int = 0 +fixed_plus: int = 0 +bad_qual: int = 0 +qual_out_of_range: int = 0 clean_reads: int = 0 seq_len_neq_qual_len: int = 0 -qual_out_range: int = 0 -plus_row: int = 0 -seq_odd_chars: int = 0 -qual_odd_chars: int = 0 -head_print: int = 0 -head_at: int = 0 -blank: int = 0 -head_plus: int = 0 -unexpected_line: int = 0 +orphan_lines: int = 0 # endregion # Allowed character of the SEQ line @@ -49,6 +49,19 @@ def open_fastq_file(file_path: str): return fastq_file_handler +def read_next_line(fin, log_frequency: int): + global tot_lines + + line = fin.readline() + # increment the total number of read not-empty lines + tot_lines += 1 + + line = skip_empty_lines(line, fin) + print_log_during_reading(tot_lines, log_frequency) + + return line + + def write_fastq_file(file_path: str): fastq_file_handler = None @@ -66,70 +79,82 @@ def write_fastq_file(file_path: str): return fastq_file_handler -def fix_header_line(line: str, checkpoint_flags: dict) -> str: - header: str = line.rstrip() - - if header.startswith("@") and len(header) > 1: - checkpoint_flags["at"] = True - elif 
header.rfind("@") > 0: +def check_header_line(line: str) -> str: + global fixed_header, bad_header + header: str = "" + + if line.isprintable() and line.rfind("@") == 0: + header = line.rstrip() + elif not line.isprintable(): + # This is a not printable header + bad_header += 1 + elif "@" not in line: + # This is an uncompliant header line + bad_header += 1 + else: # elif header.rfind("@") > 0: + header = line.rstrip() header = header[header.rfind("@"):] - checkpoint_flags["at"] = True - - global head_at - head_at += 1 + fixed_header += 1 return header -def fix_seq_line(line: str, checkpoint_flags: dict) -> str: +def check_seq_line(line: str) -> str: + global bad_seq raw_seq: str = "" - if reg.match(line): - checkpoint_flags["seq"] = True + if not line.isprintable() or not reg.match(line): + bad_seq += 1 + else: raw_seq = line return raw_seq -def fix_plus_line(line: str, checkpoint_flags: dict) -> str: - checkpoint_flags["plus"] = True +def check_plus_line(line: str) -> str: + global bad_plus, fixed_plus + plus: str = "" - if line != "+": - # Drop all characters except + of the qual line separator - line = "+" - - global head_plus - head_plus += 1 + if not line.isprintable() or line.find("+") == -1: + bad_plus += 1 + elif line.find("+") > 0: + # Drop all characters except '+' character + plus = '+' + fixed_plus += 1 + else: + plus = line - return line + return plus -def fix_qual_line(line: str, checkpoint_flags: dict) -> str: +def check_qual_line(line: str) -> str: + global qual_out_of_range, bad_qual qual: str = "" - if(line): + if not line.isprintable(): + bad_qual += 1 + + if line: min_ascii = min(ord(c) for c in line) max_ascii = max(ord(c) for c in line) if min_ascii >= 33 and max_ascii <= 126: - checkpoint_flags["qual"] = True - qual: str = line + qual = line else: - global qual_out_range - qual_out_range += 1 + qual_out_of_range += 1 return qual def skip_empty_lines(line: str, fin) -> str: + global blank + while True: if line: line = line.rstrip() break 
else: - global blank blank += 1 line = fin.readline() - continue return line @@ -147,7 +172,8 @@ def print_to_file(header, raw_seq, head_qual_sep, qual, fout): def print_log_to_file(log_out): - global tot_lines, clean_reads, seq_len_neq_qual_len, head_print, seq_odd_chars, plus_row, qual_odd_chars, unexpected_line + global tot_lines, clean_reads, seq_len_neq_qual_len, bad_qual, qual_out_of_range, bad_plus, \ + bad_seq, bad_header, fixed_header, fixed_plus, blank, orphan_lines # Open file out summary log flog = open(log_out, "wt", encoding="utf-8") @@ -157,84 +183,79 @@ def print_log_to_file(log_out): f"Clean lines: {clean_reads*4}/{tot_lines} ({round((clean_reads*4 / tot_lines) * 100, 2)}%)" + "\n" ) + flog.write(f"Not printable or uncompliant header lines: {bad_header}/{tot_lines}" + "\n") + flog.write(f"Fixed header lines: {fixed_header}/{tot_lines}" + "\n") + flog.write(f"BAD SEQ lines: {bad_seq}/{tot_lines}" + "\n") + flog.write(f"BAD '+' lines: {bad_plus}/{tot_lines}" + "\n") + flog.write(f"Fixed + lines: {fixed_plus}/{tot_lines}" + "\n") + flog.write(f"BAD QUAL lines: {bad_qual}/{tot_lines}" + "\n") + flog.write(f"QUAL out of range lines: {qual_out_of_range}/{tot_lines}" + "\n") flog.write(f"Len(SEQ) neq Len(QUAL): {seq_len_neq_qual_len}/{tot_lines}" + "\n") - flog.write(f"BAD QUAL lines: {qual_out_range}/{tot_lines}" + "\n") - flog.write(f"BAD '+' lines: {plus_row}/{tot_lines}" + "\n") - flog.write(f"BAD SEQ lines: {seq_odd_chars}/{tot_lines}" + "\n") - flog.write(f"Not printable or uncompliant header lines: {head_print}/{tot_lines}" + "\n") - flog.write(f"Not printable qual lines: {qual_odd_chars}/{tot_lines}" + "\n") - flog.write(f"Fixed header lines: {head_at}/{tot_lines}" + "\n") - flog.write(f"Fixed + lines: {head_plus}/{tot_lines}" + "\n") flog.write(f"Blank lines: {blank}/{tot_lines}" + "\n") - flog.write(f"Missplaced lines: {unexpected_line}/{tot_lines}" + "\n") + flog.write(f"Orphan lines: {orphan_lines}/{tot_lines}" + "\n") flog.close() def 
print_log_to_screen(): - global tot_lines, clean_reads, seq_len_neq_qual_len, head_print, seq_odd_chars, plus_row, qual_odd_chars, unexpected_line + global tot_lines, clean_reads, seq_len_neq_qual_len, bad_qual, qual_out_of_range, bad_plus, \ + bad_seq, bad_header, fixed_header, fixed_plus, blank, orphan_lines - if tot_lines == clean_reads: - logging.info(f"Clean lines: {clean_reads*4}/{tot_lines} ({round((clean_reads*4 / tot_lines) * 100, 2)}%)") + if tot_lines == (clean_reads*4): + logging.info(f"Clean lines: {clean_reads * 4}/{tot_lines} ({round((clean_reads * 4 / tot_lines) * 100, 2)}%)") else: - logging.warning(f"Clean lines: {clean_reads * 4}/{tot_lines} ({round((clean_reads * 4 / tot_lines) * 100, 2)}%)") + logging.warning( + f"Clean lines: {clean_reads * 4}/{tot_lines} ({round((clean_reads * 4 / tot_lines) * 100, 2)}%)") - if seq_len_neq_qual_len == 0: - logging.info(f"Len(SEQ) neq Len(QUAL): {seq_len_neq_qual_len}/{tot_lines}") + if bad_header == 0: + logging.info(f"Not printable or uncompliant header lines: {bad_header}/{tot_lines}") else: - logging.warning(f"Len(SEQ) neq Len(QUAL): {seq_len_neq_qual_len}/{tot_lines}") + logging.warning(f"Not printable or uncompliant header lines: {bad_header}/{tot_lines}") - if qual_out_range == 0: - logging.info(f"BAD QUAL lines: {qual_out_range}/{tot_lines}") + if fixed_header == 0: + logging.info(f"Fixed header lines: {fixed_header}/{tot_lines}") else: - logging.warning(f"BAD QUAL lines: {qual_out_range}/{tot_lines}") + logging.warning(f"Fixed header lines: {fixed_header}/{tot_lines}") - if plus_row == 0: - logging.info(f"BAD '+' lines: {plus_row}/{tot_lines}") + if bad_seq == 0: + logging.info(f"BAD SEQ lines: {bad_seq}/{tot_lines}") else: - logging.warning(f"BAD '+' lines: {plus_row}/{tot_lines}") + logging.warning(f"BAD SEQ lines: {bad_seq}/{tot_lines}") - if seq_odd_chars == 0: - logging.info(f"BAD SEQ lines: {seq_odd_chars}/{tot_lines}") + if bad_plus == 0: + logging.info(f"BAD '+' lines: {bad_plus}/{tot_lines}") 
else: - logging.warning(f"BAD SEQ lines: {seq_odd_chars}/{tot_lines}") + logging.warning(f"BAD '+' lines: {bad_plus}/{tot_lines}") - if head_print == 0: - logging.info(f"Not printable or uncompliant header lines: {head_print}/{tot_lines}") + if fixed_plus == 0: + logging.info(f"Fixed + lines: {fixed_plus}/{tot_lines}") else: - logging.warning(f"Not printable or uncompliant header lines: {head_print}/{tot_lines}") + logging.warning(f"Fixed + lines: {fixed_plus}/{tot_lines}") - if qual_odd_chars == 0: - logging.info(f"Not printable qual lines: {qual_odd_chars}/{tot_lines}") + if bad_qual == 0: + logging.info(f"BAD QUAL lines: {qual_out_of_range}/{tot_lines}") else: - logging.warning(f"Not printable qual lines: {qual_odd_chars}/{tot_lines}") + logging.warning(f"BAD QUAL lines: {qual_out_of_range}/{tot_lines}") - if head_at == 0: - logging.info(f"Fixed header lines: {head_at}/{tot_lines}") + if qual_out_of_range == 0: + logging.info(f"QUAL out of range lines: {qual_out_of_range}/{tot_lines}") else: - logging.warning(f"Fixed header lines: {head_at}/{tot_lines}") + logging.warning(f"QUAL out of range lines: {qual_out_of_range}/{tot_lines}") - if head_plus == 0: - logging.info(f"Fixed + lines: {head_plus}/{tot_lines}") + if seq_len_neq_qual_len == 0: + logging.info(f"Len(SEQ) neq Len(QUAL): {seq_len_neq_qual_len}/{tot_lines}") else: - logging.warning(f"Fixed + lines: {head_plus}/{tot_lines}") + logging.warning(f"Len(SEQ) neq Len(QUAL): {seq_len_neq_qual_len}/{tot_lines}") if blank == 0: logging.info(f"Blank lines: {blank}/{tot_lines}") else: logging.warning(f"Blank lines: {blank}/{tot_lines}") - if unexpected_line == 0: - logging.info(f"Missplaced lines: {unexpected_line}/{tot_lines}") + if orphan_lines == 0: + logging.info(f"Orphan lines: {orphan_lines}/{tot_lines}") else: - logging.warning(f"Missplaced lines: {unexpected_line}/{tot_lines}") - - -def reset_flags(checkpoint_flags: dict): - checkpoint_flags["at"] = False - checkpoint_flags["seq"] = False - 
checkpoint_flags["plus"] = False - checkpoint_flags["qual"] = False + logging.warning(f"Orphan lines: {orphan_lines}/{tot_lines}") def count_remainder(): @@ -242,7 +263,7 @@ def count_remainder(): return tot_lines % 4 -def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): +def wipe_fastq_old(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): # MIN_HEADER_LEN = 10 fin = open_fastq_file(fastq_in) @@ -269,7 +290,7 @@ def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): qual: str = "" # global variables anchor - global tot_lines, clean_reads, seq_len_neq_qual_len, head_print, seq_odd_chars, plus_row, qual_odd_chars, unexpected_line + global tot_lines, clean_reads, seq_len_neq_qual_len, head_print, bad_seq, plus_row, qual_odd_chars, unexpected_line # Loop through 4-line reads for line in fin: @@ -308,7 +329,7 @@ def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): and not checkpoint_flags["plus"] and not checkpoint_flags["qual"] ): - head_qual_sep = fix_plus_line(line, checkpoint_flags) + head_qual_sep = check_plus_line(line, checkpoint_flags) # qual line elif ( @@ -318,7 +339,7 @@ def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): and checkpoint_flags["plus"] and not checkpoint_flags["qual"] ): - qual = fix_qual_line(line, checkpoint_flags) + qual = check_qual_line(line, checkpoint_flags) # Eventually print to file if len(raw_seq) == len(qual) and len(qual) != 0: @@ -353,7 +374,7 @@ def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): and not checkpoint_flags["plus"] and not checkpoint_flags["qual"] ): - seq_odd_chars += 1 + bad_seq += 1 elif ( checkpoint_flags["at"] and checkpoint_flags["seq"] @@ -388,6 +409,72 @@ def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): print_log_to_screen() +def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): + # MIN_HEADER_LEN 
= 10 + + fin = open_fastq_file(fastq_in) + if not fin: + logging.critical("The input FASTQ file does not exist or bad extension (.gz or .fastq.gz)") + else: + logging.info(f"Start wiping {fastq_in}") + + # Open file out stream + fout = write_fastq_file(fastq_out) + + # global variables anchor + global tot_lines, clean_reads, seq_len_neq_qual_len, orphan_lines + + # Loop through 4-line reads + for line in fin: + tot_lines += 1 + print_log_during_reading(tot_lines, log_frequency) + line = skip_empty_lines(line, fin) + + # PROCESS THE HEADER LINE + header: str = check_header_line(line) + if not header: + continue + + # PROCESS THE SEQ LINE + line = read_next_line(fin, log_frequency) + raw_seq: str = check_seq_line(line) + if not raw_seq: + continue + + # PROCESS the + line + line = read_next_line(fin, log_frequency) + plus: str = check_plus_line(line) + if not plus: + continue + + # PROCESS the QUAL line + line = read_next_line(fin, log_frequency) + qual: str = check_qual_line(line) + if not qual: + continue + + # Eventually print to file + if len(raw_seq) == len(qual) and len(qual) != 0: + print_to_file(header, raw_seq, plus, qual, fout) + clean_reads += 1 + else: + seq_len_neq_qual_len += 1 + + # All lines that do not refer to any read at the end of the file are considered "unexpected" (1 to 3 lines max) + orphan_lines = count_remainder() + + fout.close() + fin.close() + + # Print short report + logging.info("Successfully terminated\n") + + if log_out: + print_log_to_file(log_out) + else: + print_log_to_screen() + + if __name__ == "__main__": parser = argparse.ArgumentParser(description='FastqWiper entrypoint') parser.add_argument("-i", '--fastq_in', help='The corrupted FASTQ file', required=True) From 84a976c86464bcdbe196866002472e615ce1ac4f Mon Sep 17 00:00:00 2001 From: mazzalab Date: Fri, 24 May 2024 15:13:42 +0200 Subject: [PATCH 4/5] Fixed wipe_fastq --- fastq_wiper/wiper.py | 217 +++++-------------------------------------- 1 file changed, 23 insertions(+), 
194 deletions(-) diff --git a/fastq_wiper/wiper.py b/fastq_wiper/wiper.py index 09909a4..3242fbf 100644 --- a/fastq_wiper/wiper.py +++ b/fastq_wiper/wiper.py @@ -23,12 +23,8 @@ qual_out_of_range: int = 0 clean_reads: int = 0 seq_len_neq_qual_len: int = 0 -orphan_lines: int = 0 # endregion -# Allowed character of the SEQ line -reg = re.compile("^[ACGTN]+$") - def open_fastq_file(file_path: str): fastq_file_handler = None @@ -53,11 +49,10 @@ def read_next_line(fin, log_frequency: int): global tot_lines line = fin.readline() - # increment the total number of read not-empty lines - tot_lines += 1 - - line = skip_empty_lines(line, fin) - print_log_during_reading(tot_lines, log_frequency) + if line: + # increment the total number of read not-empty lines + tot_lines += 1 + print_log_during_reading(tot_lines, log_frequency) return line @@ -84,7 +79,7 @@ def check_header_line(line: str) -> str: header: str = "" if line.isprintable() and line.rfind("@") == 0: - header = line.rstrip() + header = line elif not line.isprintable(): # This is a not printable header bad_header += 1 @@ -92,7 +87,7 @@ def check_header_line(line: str) -> str: # This is an uncompliant header line bad_header += 1 else: # elif header.rfind("@") > 0: - header = line.rstrip() + header = line header = header[header.rfind("@"):] fixed_header += 1 @@ -145,20 +140,6 @@ def check_qual_line(line: str) -> str: return qual -def skip_empty_lines(line: str, fin) -> str: - global blank - - while True: - if line: - line = line.rstrip() - break - else: - blank += 1 - line = fin.readline() - - return line - - def print_log_during_reading(lines, log_frequency): if lines % log_frequency == 0: logging.info(f"Cleaned {lines} reads") @@ -173,7 +154,7 @@ def print_to_file(header, raw_seq, head_qual_sep, qual, fout): def print_log_to_file(log_out): global tot_lines, clean_reads, seq_len_neq_qual_len, bad_qual, qual_out_of_range, bad_plus, \ - bad_seq, bad_header, fixed_header, fixed_plus, blank, orphan_lines + bad_seq, 
bad_header, fixed_header, fixed_plus, blank # Open file out summary log flog = open(log_out, "wt", encoding="utf-8") @@ -192,14 +173,13 @@ def print_log_to_file(log_out): flog.write(f"QUAL out of range lines: {qual_out_of_range}/{tot_lines}" + "\n") flog.write(f"Len(SEQ) neq Len(QUAL): {seq_len_neq_qual_len}/{tot_lines}" + "\n") flog.write(f"Blank lines: {blank}/{tot_lines}" + "\n") - flog.write(f"Orphan lines: {orphan_lines}/{tot_lines}" + "\n") flog.close() def print_log_to_screen(): global tot_lines, clean_reads, seq_len_neq_qual_len, bad_qual, qual_out_of_range, bad_plus, \ - bad_seq, bad_header, fixed_header, fixed_plus, blank, orphan_lines + bad_seq, bad_header, fixed_header, fixed_plus, blank if tot_lines == (clean_reads*4): logging.info(f"Clean lines: {clean_reads * 4}/{tot_lines} ({round((clean_reads * 4 / tot_lines) * 100, 2)}%)") @@ -252,162 +232,6 @@ def print_log_to_screen(): else: logging.warning(f"Blank lines: {blank}/{tot_lines}") - if orphan_lines == 0: - logging.info(f"Orphan lines: {orphan_lines}/{tot_lines}") - else: - logging.warning(f"Orphan lines: {orphan_lines}/{tot_lines}") - - -def count_remainder(): - global tot_lines - return tot_lines % 4 - - -def wipe_fastq_old(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): - # MIN_HEADER_LEN = 10 - - fin = open_fastq_file(fastq_in) - if not fin: - logging.critical("The input FASTQ file does not exist or bad extension (.gz or .fastq.gz)") - else: - logging.info(f"Start wiping {fastq_in}") - - # Open file out stream - fout = write_fastq_file(fastq_out) - - # change if a lines is parsed successfully - checkpoint_flags = { - "at": False, - "seq": False, - "plus": False, - "qual": False, - } - - # Cleaned lines to be printed - header: str = "" - raw_seq: str = "" - head_qual_sep: str = "" - qual: str = "" - - # global variables anchor - global tot_lines, clean_reads, seq_len_neq_qual_len, head_print, bad_seq, plus_row, qual_odd_chars, unexpected_line - - # Loop through 4-line reads - for 
line in fin: - tot_lines += 1 # increment the total number of read lines - - print_log_during_reading(tot_lines, log_frequency) - line = skip_empty_lines(line, fin) - - # header line - if ( - line.isprintable() - and "@" in line - and not checkpoint_flags["at"] - and not checkpoint_flags["seq"] - and not checkpoint_flags["plus"] - and not checkpoint_flags["qual"] - ): - header = fix_header_line(line, checkpoint_flags) - - # seq line - elif ( - line.isprintable() - and checkpoint_flags["at"] - and not checkpoint_flags["seq"] - and not checkpoint_flags["plus"] - and not checkpoint_flags["qual"] - ): - raw_seq = fix_seq_line(line, checkpoint_flags) - - # + line - elif ( - "+" in line - and line.isprintable() - and checkpoint_flags["at"] - and checkpoint_flags["seq"] - and not checkpoint_flags["plus"] - and not checkpoint_flags["qual"] - ): - head_qual_sep = check_plus_line(line, checkpoint_flags) - - # qual line - elif ( - line.isprintable() - and checkpoint_flags["at"] - and checkpoint_flags["seq"] - and checkpoint_flags["plus"] - and not checkpoint_flags["qual"] - ): - qual = check_qual_line(line, checkpoint_flags) - - # Eventually print to file - if len(raw_seq) == len(qual) and len(qual) != 0: - print_to_file(header, raw_seq, head_qual_sep, qual, fout) - clean_reads += 1 - else: - seq_len_neq_qual_len += 1 - - reset_flags(checkpoint_flags) - - # unexpected line - else: - if ( - "@" in line - and not checkpoint_flags["at"] - and not checkpoint_flags["seq"] - and not checkpoint_flags["plus"] - and not checkpoint_flags["qual"] - ): - head_print += 1 - elif ( - "@" not in line - and not checkpoint_flags["at"] - and not checkpoint_flags["seq"] - and not checkpoint_flags["plus"] - and not checkpoint_flags["qual"] - ): - unexpected_line += 1 - elif ( - checkpoint_flags["at"] - and not checkpoint_flags["seq"] - and not checkpoint_flags["plus"] - and not checkpoint_flags["qual"] - ): - bad_seq += 1 - elif ( - checkpoint_flags["at"] - and checkpoint_flags["seq"] - and not 
checkpoint_flags["plus"] - and not checkpoint_flags["qual"] - ): - plus_row += 1 - elif ( - checkpoint_flags["at"] - and checkpoint_flags["seq"] - and checkpoint_flags["plus"] - and not checkpoint_flags["qual"] - ): - qual_odd_chars += 1 - else: - unexpected_line += 1 - - reset_flags(checkpoint_flags) - - # All lines that do not refer to any header at the end of the file are considered "unexpected" (1 to 3 lines max) - unexpected_line += count_remainder() - - fout.close() - fin.close() - - # Print short report - logging.info("Successfully terminated\n") - - if log_out: - print_log_to_file(log_out) - else: - print_log_to_screen() - def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): # MIN_HEADER_LEN = 10 @@ -422,34 +246,37 @@ def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): fout = write_fastq_file(fastq_out) # global variables anchor - global tot_lines, clean_reads, seq_len_neq_qual_len, orphan_lines + global tot_lines, clean_reads, seq_len_neq_qual_len # Loop through 4-line reads for line in fin: + if not line.strip(): + # skip empty lines + continue + tot_lines += 1 print_log_during_reading(tot_lines, log_frequency) - line = skip_empty_lines(line, fin) # PROCESS THE HEADER LINE - header: str = check_header_line(line) + header: str = check_header_line(line.rstrip()) if not header: continue # PROCESS THE SEQ LINE line = read_next_line(fin, log_frequency) - raw_seq: str = check_seq_line(line) + raw_seq: str = check_seq_line(line.rstrip()) if not raw_seq: continue # PROCESS the + line line = read_next_line(fin, log_frequency) - plus: str = check_plus_line(line) + plus: str = check_plus_line(line.rstrip()) if not plus: continue # PROCESS the QUAL line line = read_next_line(fin, log_frequency) - qual: str = check_qual_line(line) + qual: str = check_qual_line(line.rstrip()) if not qual: continue @@ -460,9 +287,6 @@ def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): else: 
seq_len_neq_qual_len += 1 - # All lines that do not refer to any read at the end of the file are considered "unexpected" (1 to 3 lines max) - orphan_lines = count_remainder() - fout.close() fin.close() @@ -480,7 +304,12 @@ def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): parser.add_argument("-i", '--fastq_in', help='The corrupted FASTQ file', required=True) parser.add_argument("-o", '--fastq_out', help='The wiped FASTQ file', required=True) parser.add_argument("-l", '--log_out', nargs='?', help='The file name of the final quality report summary') - parser.add_argument("-f", '--log_frequency', type=int, nargs='?', const=500000, help='The number of reads you want to print a status message') + parser.add_argument("-f", '--log_frequency', type=int, nargs='?', default=500000, const=500000, help='The number of reads you want to print a status message') + parser.add_argument("-a", '--alphabet', type=str, nargs='?', default="ACGTN", const="ACGTN", + help='Allowed character in the SEQ line. 
Default: ACGTN') args = parser.parse_args() + # Allowed character of the SEQ line + reg = re.compile(f"^[{args.alphabet}]+$") + wipe_fastq(args.fastq_in, args.fastq_out,args.log_out, args.log_frequency) From 08c68e0def446b3aa0237cdbfdb77b577243ed9d Mon Sep 17 00:00:00 2001 From: mazzalab Date: Sat, 25 May 2024 09:17:29 +0200 Subject: [PATCH 5/5] Rewritten wipe_fastq method and README.md --- Dockerfile | 6 +- README.md | 60 ++++++++++--------- Singularity.def | 4 +- fastq_wiper/wiper.py | 6 +- pipeline/fix_wipe_pairs_reads_parallel.smk | 4 +- pipeline/fix_wipe_pairs_reads_sequential.smk | 4 +- pipeline/fix_wipe_single_reads_parallel.smk | 4 +- pipeline/fix_wipe_single_reads_sequential.smk | 6 +- run_wiping.sh | 27 +++++++-- 9 files changed, 74 insertions(+), 47 deletions(-) diff --git a/Dockerfile b/Dockerfile index 9a3c924..43cf604 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,9 +32,9 @@ RUN chmod +x run_wiping.sh ENTRYPOINT ["/fastqwiper/run_wiping.sh"] -# paired mode, 4 cores, sample name, #rows-per-chunk, ASCII offset (33=Sanger, 64=old Solexa) -CMD ["paired", "4", "sample", "50000000", "33"] +# paired mode, 4 cores, sample name, #rows-per-chunk, ASCII offset (33=Sanger, 64=old Solexa), alphabet (e.g., ACGTN), log frequency (500000) +CMD ["paired", "4", "sample", "50000000", "33", "ACGTN", "500000"] # docker build -t test . 
-# docker run --rm -ti --name test -v "D:\Projects\fastqwiper\data:/fastqwiper/data" test paired 4 sample 50000000 33 +# docker run --rm -ti --name test -v "D:\Projects\fastqwiper\data:/fastqwiper/data" test paired 4 sample 50000000 33 ACGTN 500000 # docker exec -ti test /bin/bash \ No newline at end of file diff --git a/README.md b/README.md index c51c068..28377db 100644 --- a/README.md +++ b/README.md @@ -7,10 +7,10 @@ [![Docker](https://badgen.net/badge/icon/docker?icon=docker&label)](https://hub.docker.com/r/mazzalab/fastqwiper) ![Docker Pulls](https://img.shields.io/docker/pulls/mazzalab/fastqwiper) -`FastqWiper` is a Snakemake-enabled application that wipes out bad reads from broken FASTQ files. Additionally, the available and pre-designed Snakemake [workflows](https://github.com/mazzalab/fastqwiper/tree/main/pipeline) allows **recovering** corrupted `fastq.gz`, **dropping** or **fixing** pesky lines, **removing** unpaired reads, and **fixing** reads interleaving. +`FastqWiper` is a Snakemake-enabled application that wipes out bad reads from broken FASTQ files. Additionally, the available and pre-designed Snakemake [workflows](https://github.com/mazzalab/fastqwiper/tree/main/pipeline) allows **recovering** corrupted `fastq.gz`, **dropping** or **fixing** pesky lines, **removing** unpaired reads, and **settling** reads interleaving. * Compatibility: Python ≥3.7, <3.13 -* OS: Windows, Linux, Mac OS (Snakemake workflows through Docker for Windows) +* OS: Windows, Linux, Mac OS (Snakemake workflows only through Docker for Windows) * Contributions: [bioinformatics@css-mendel.it](bioinformatics@css-mendel.it) * Docker: https://hub.docker.com/r/mazzalab/fastqwiper * Singularity: https://cloud.sylabs.io/library/mazzalab/fastqwiper/fastqwiper.sif @@ -18,14 +18,14 @@ ## USAGE -- **Case 1**. You have one or a couple (R1&R2) of **computer readable** FASTQ files which contain pesky, unformatted, uncompliant lines: Use *FastWiper* to clean them; -- **Case 2**. 
You have one or a couple (R1&R2) of **computer readable** FASTQ files that you want to drop unpaired reads from or fix reads interleaving: Use the FastqWiper's *Snakemake workflows*; -- **Case 3**. You have one `fastq.gz` file or a couple (R1&R2) of `fastq.gz` files which are corrupted (**unreadable**) and you want to recover healthy reads and reformat them: Use the FastqWiper's *Snakemake workflows*; +- **Case 1.** You have one or a couple (R1&R2) of **computer readable** (meaning that the .gz files can be successfully decompressed or that the .fa/.fasta files can be viewed from the beginning to the EOF) FASTQ files which contain pesky, unformatted, uncompliant lines: Use *FastqWiper* to clean them; +- **Case 2.** You have one or a couple (R1&R2) of **computer readable** FASTQ files that you want to drop unpaired reads from or fix reads interleaving: Use the FastqWiper's *Snakemake workflows*; +- **Case 3.** You have one `fastq.gz` file or a couple (R1&R2) of `fastq.gz` files which are corrupted (**unreadable**, meaning that the .gz files cannot be successfully decompressed) and you want to recover healthy reads and reformat them: Use the FastqWiper's *Snakemake workflows*; ## Installation -### Case 1 -This requires you to install FastqWiper and therefore does not require you to configure *workflows* also. You can do it for all OSs: +### Case 1 +This requires you to install FastqWiper and therefore not to use *workflows*. You can do it for all OSs: #### Use Conda
+ -i, --fastq_in TEXT The input FASTQ file to be cleaned [required] + -o, --fastq_out TEXT The wiped FASTQ file [required] + -f, --log_frequency INTEGER The number of reads you want to print a status message. Default: 500000 + -l, --log_out TEXT The file name of the final quality report summary. Print on the screen if not specified + -a, --alphabet Allowed character in the SEQ line. Default: ACGTN + -h, --help Show this message and exit. ``` -It accepts in input and outputs **readable** `*.fastq` or `*.fastq.gz` files. +It accepts strictly **readable** `*.fastq` or `*.fastq.gz` files in input. -### Cases 2 & 3 +### Case 2 & Case 3 There are QUICK and a SLOW methods to configure `FastqWiper`'s workflows. @@ -70,7 +71,7 @@ There are QUICK and a SLOW methods to configure `FastqWiper`'s wor 2. Once downloaded the image, type: -CMD: `docker run --rm -ti --name fastqwiper -v "YOUR_LOCAL_PATH_TO_DATA_FOLDER:/fastqwiper/data" mazzalab/fastqwiper paired 8 sample 50000000 33` +CMD: `docker run --rm -ti --name fastqwiper -v "YOUR_LOCAL_PATH_TO_DATA_FOLDER:/fastqwiper/data" mazzalab/fastqwiper paired 8 sample 50000000 33 ACGTN` #### Another quick way (Singularity) 1. Pull the Singularity image from the Cloud Library: @@ -79,11 +80,11 @@ CMD: `docker run --rm -ti --name fastqwiper -v "YOUR_LOCAL_PATH_TO_DATA_FOLDER:/ 2.
Once downloaded the image (e.g., fastqwiper.sif_2024.1.89.sif), type: -CMD `singularity run --bind YOUR_LOCAL_PATH_TO_DATA_FOLDER:/fastqwiper/data --writable-tmpfs fastqwiper.sif_2024.1.89.sif paired 8 sample 50000000 33` +CMD `singularity run --bind YOUR_LOCAL_PATH_TO_DATA_FOLDER:/fastqwiper/data --writable-tmpfs fastqwiper.sif_2024.1.89.sif paired 8 sample 50000000 33 ACGTN` If you want to bind the `.singularity` cache folder and the `logs` folder, you can omit `--writable-tmpfs`, create the folders `.singularity` and `logs` (`mkdir .singularity logs`) on the host system, and use this command instead: -CMD: `singularity run --bind YOUR_LOCAL_PATH_TO_DATA_FOLDER/:/fastqwiper/data --bind YOUR_LOCAL_PATH_TO_.SNAKEMAKE_FOLDER/:/fastqwiper/.snakemake --bind YOUR_LOCAL_PATH_TO_LOGS_FOLDER/:/fastqwiper/logs fastqwiper.sif_2024.1.89.sif paired 8 sample 50000000 33` +CMD: `singularity run --bind YOUR_LOCAL_PATH_TO_DATA_FOLDER/:/fastqwiper/data --bind YOUR_LOCAL_PATH_TO_.SNAKEMAKE_FOLDER/:/fastqwiper/.snakemake --bind YOUR_LOCAL_PATH_TO_LOGS_FOLDER/:/fastqwiper/logs fastqwiper.sif_2024.1.89.sif paired 8 sample 50000000 33 ACGTN` For both **Docker** and **Singularity**: @@ -92,9 +93,10 @@ For both **Docker** and **Singularity**: - `8` is the number of your choice of computing cores to be spawned; - `sample` is part of the names of the FASTQ files to be wiped. Be aware that: for paired-end files (e.g., "sample_R1.fastq.gz" and "sample_R2.fastq.gz"), your files must finish with `_R1.fastq.gz` and `_R2.fastq.gz`. Therefore, the argument to pass is everything before these texts: `sample` in this case. For single end/individual files (e.g., "excerpt_R1_001.fastq.gz"), your file must end with the string `.fastq.gz`; the preceding text, i.e., "excerpt_R1_001" in this case, will be the text to be passed to the command as an argument. - `50000000` (optional) is the number of rows-per-chunk (used when cores>1. It must be a number multiple of 4). 
Increasing this number too much would reduce the parallelism advantage. Decreasing this number too much would increase the number of chunks more than the number of available cpus, making parallelism unefficient. Choose this number wisely depending on the total number of reads in your starting file. -- `33` (optional) is the ASCII offset (33=Sanger, 64=old Solexa) +- `33` (optional) is the ASCII offset (33=Sanger, 64=old Solexa) +- `ACGTN` (optional) is the allowed alphabet in the SEQ line of the FASTQ file -#### The slow way (Linux & Mac OS) +### The slow way (Linux & Mac OS) To enable the use of preconfigured [pipelines](https://github.com/mazzalab/fastqwiper/tree/main/pipeline), you need to install **Snakemake**. The recommended way to install Snakemake is via Conda, because it enables **Snakemake** to [handle software dependencies of your workflow](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#integrated-package-management). However, the default conda solver is slow and often hangs. Therefore, we recommend installing [Mamba](https://github.com/mamba-org/mamba) as a drop-in replacement via @@ -125,12 +127,12 @@ cd fastqwiper ``` It contains, in particular, a folder `data` containing the fastq files to be processed, a folder `pipeline` containing the released pipelines and a folder `fastq_wiper` with the source files of `FastqWiper`.
-Input files to be processed should be copied into the **data** folder. +Input files to be processed must be copied into the **data** folder. Currently, to run the `FastqWiper` pipelines, the following packages need to be installed manually: ### required packages: -[gzrt](https://github.com/arenn/gzrt) (Linux build fron source [instructions](https://github.com/arenn/gzrt/blob/master/README.build), Ubuntu install [instructions](https://howtoinstall.co/en/gzrt), Mac OS install [instructions](https://formulae.brew.sh/formula/gzrt)) +[gzrt](https://github.com/arenn/gzrt) (Linux build from source [instructions](https://github.com/arenn/gzrt/blob/master/README.build), Ubuntu install [instructions](https://howtoinstall.co/en/gzrt), Mac OS install [instructions](https://formulae.brew.sh/formula/gzrt)) [BBTools](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/) (install [instructions](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/installation-guide/)) @@ -140,19 +142,21 @@ If installed from source, `gzrt` scripts need to be put on PATH. `bbmap` must be ### Commands: Copy the fastq files you want to fix in the `data` folder. -**N.b.**: In all commands above, you will pass to the workflow the name of the sample to be analyzed through the config argument: `sample_name`. Remember that your fastq files' names must finish with `_R1.fastq.gz` and `_R2.fastq.gz`, for paired fastq files, and with `.fastq.gz`, for individual fastq files, and, therefore, the text to be assigned to the variable `sample_name` must be everything before them. E.g., if your files are `my_sample_R1.fastq.gz` and `my_sample_R2.fastq.gz`, then `--config sample_name=my_sample`. + +**N.b.**: In all commands above, you will pass the name of the sample to be analyzed to the workflow through the config argument: `sample_name`. 
Remember that your fastq files' names must finish with `_R1.fastq.gz` and `_R2.fastq.gz`, for paired fastq files, and with `.fastq.gz`, for individual fastq files, and, therefore, the text to be assigned to the variable `sample_name` must be everything before them. E.g., if your files are `my_sample_R1.fastq.gz` and `my_sample_R2.fastq.gz`, then `--config sample_name=my_sample`. + #### Paired-end files - **Get a dry run** of a pipeline (e.g., `fix_wipe_pairs_reads_sequential.smk`):
-`snakemake --config sample_name=my_sample qin=33 -s pipeline/fix_wipe_pairs_reads_sequential.smk --use-conda --cores 4` +`snakemake --config sample_name=my_sample qin=33 alphabet=ACGTN -s pipeline/fix_wipe_pairs_reads_sequential.smk --use-conda --cores 4` - **Generate the planned DAG**:
-`snakemake --config sample_name=my_sample qin=33 -s pipeline/fix_wipe_pairs_reads_sequential.smk --dag | dot -Tpdf > dag.pdf`

+`snakemake --config sample_name=my_sample qin=33 alphabet=ACGTN -s pipeline/fix_wipe_pairs_reads_sequential.smk --dag | dot -Tpdf > dag.pdf`

- **Run the pipeline** (n.b., during the first execution, Snakemake will download and install some required remote packages and may take longer). The number of computing cores can be tuned accordingly:
-`snakemake --config sample_name=my_sample -s pipeline/fix_wipe_single_reads_sequential.smk --use-conda --cores 2` +`snakemake --config sample_name=my_sample alphabet=ACGTN -s pipeline/fix_wipe_single_reads_sequential.smk --use-conda --cores 2` Fixed files will be copied in the `data` folder and will be suffixed with the string `_fixed_wiped_paired_interleaving`. We remind that the `fix_wipe_pairs_reads_sequential.smk` and `fix_wipe_pairs_reads_parallel.smk` pipelines perform the following actions: @@ -165,14 +169,14 @@ We remind that the `fix_wipe_pairs_reads_sequential.smk` and `fix_wipe_pairs_rea `fix_wipe_single_reads_parallel.smk` and `fix_wipe_single_reads_sequential.smk` will not execute `trimmomatic` and BBmap's `repair.sh`. - **Get a dry run** of a pipeline (e.g., `fix_wipe_single_reads_sequential.smk`):
-`snakemake --config sample_name=my_sample -s pipeline/fix_wipe_single_reads_sequential.smk --use-conda --cores 2 -np` +`snakemake --config sample_name=my_sample alphabet=ACGTN -s pipeline/fix_wipe_single_reads_sequential.smk --use-conda --cores 2 -np` - **Generate the planned DAG**:
-`snakemake --config sample_name=my_sample -s pipeline/fix_wipe_single_reads_sequential.smk --dag | dot -Tpdf > dag.pdf`

+`snakemake --config sample_name=my_sample alphabet=ACGTN -s pipeline/fix_wipe_single_reads_sequential.smk --dag | dot -Tpdf > dag.pdf`

- **Run the pipeline** (n.b., The number of computing cores can be tuned accordingly):
`snakemake --config sample_name=my_sample alphabet=ACGTN -s pipeline/fix_wipe_single_reads_sequential.smk --use-conda --cores 2` # Author **Tommaso Mazza** diff --git a/Singularity.def b/Singularity.def index 046e747..a6822e2 100644 --- a/Singularity.def +++ b/Singularity.def @@ -37,9 +37,9 @@ From: condaforge/mambaforge chmod 777 /fastqwiper/run_wiping.sh %runscript - if [ $# -eq 5 ] || [ $# -eq 3 ] || [ $# -eq 0 ]; then + if [ $# -eq 0 ] || { [ $# -ge 3 ] && [ $# -le 7 ]; }; then exec /fastqwiper/run_wiping.sh $@ else - echo "You must provide three + 2 optional arguments [computing mode ('paired' or 'single'), # of cores (int), sample name (string), chunk size (optional, int), ASCII offset (optional, 33 or 64)]" + echo "You must provide three + 4 optional arguments [computing mode ('paired' or 'single'), # of cores (int), sample name (string), chunk size (optional, int), ASCII offset (optional, 33 or 64), allowed SEQ alphabet (optional, e.g., ACGTN), log frequency (optional, '500000')]" exit 1 fi diff --git a/fastq_wiper/wiper.py b/fastq_wiper/wiper.py index 3242fbf..ac16c6b 100644 --- a/fastq_wiper/wiper.py +++ b/fastq_wiper/wiper.py @@ -303,8 +303,10 @@ def wipe_fastq(fastq_in: str, fastq_out: str, log_out: str, log_frequency: int): parser = argparse.ArgumentParser(description='FastqWiper entrypoint') parser.add_argument("-i", '--fastq_in', help='The corrupted FASTQ file', required=True) parser.add_argument("-o", '--fastq_out', help='The wiped FASTQ file', required=True) - parser.add_argument("-l", '--log_out', nargs='?', help='The file name of the final quality report summary') - parser.add_argument("-f", '--log_frequency', type=int, nargs='?', default=500000, const=500000, help='The number of reads you want to print a status message') + parser.add_argument("-l", '--log_out', nargs='?', + help='The file name of the final quality report summary. 
Print on the screen if not specified') + parser.add_argument("-f", '--log_frequency', type=int, nargs='?', default=500000, const=500000, + help='The number of reads you want to print a status message. Default: 500000') parser.add_argument("-a", '--alphabet', type=str, nargs='?', default="ACGTN", const="ACGTN", help='Allowed character in the SEQ line. Default: ACGTN') args = parser.parse_args() diff --git a/pipeline/fix_wipe_pairs_reads_parallel.smk b/pipeline/fix_wipe_pairs_reads_parallel.smk index 4719d7a..d3bb30e 100644 --- a/pipeline/fix_wipe_pairs_reads_parallel.smk +++ b/pipeline/fix_wipe_pairs_reads_parallel.smk @@ -6,6 +6,8 @@ from snakemake.io import expand, temp SAMPLE=config["sample_name"] QIN=config["qin"] +ALPHABET=config["alphabet"] +LOG_FREQ=config["log_freq"] rule all: input: @@ -48,7 +50,7 @@ rule wipe_fastq_parallel: message: "Running FastqWiper on {input}." shell:''' - fastqwiper --fastq_in {input} --fastq_out {output} --log_out data/{wildcards.sample}_chunks/{wildcards.sample}_final_summary.txt --log_frequency 300 2> {log} + fastqwiper --fastq_in {input} --fastq_out {output} --log_out data/{wildcards.sample}_chunks/{wildcards.sample}_final_summary.txt --log_frequency {LOG_FREQ} --alphabet {ALPHABET} 2> {log} ''' def aggregate_input(wildcards): diff --git a/pipeline/fix_wipe_pairs_reads_sequential.smk b/pipeline/fix_wipe_pairs_reads_sequential.smk index f740194..081b35e 100644 --- a/pipeline/fix_wipe_pairs_reads_sequential.smk +++ b/pipeline/fix_wipe_pairs_reads_sequential.smk @@ -4,6 +4,8 @@ from snakemake.io import expand, temp SAMPLES=config["sample_name"] QIN=config["qin"] +ALPHABET=config["alphabet"] +LOG_FREQ=config["log_freq"] rule all: input: @@ -31,7 +33,7 @@ rule wipe_fastq: message: "Running FastqWiper on {input}." 
shell:''' - fastqwiper --fastq_in {input} --fastq_out {output} --log_out data/{wildcards.sample}_final_summary.txt 2> {log} + fastqwiper --fastq_in {input} --fastq_out {output} --log_out data/{wildcards.sample}_final_summary.txt --log_frequency {LOG_FREQ} --alphabet {ALPHABET} 2> {log} ''' rule drop_unpaired: diff --git a/pipeline/fix_wipe_single_reads_parallel.smk b/pipeline/fix_wipe_single_reads_parallel.smk index 982309c..4721127 100644 --- a/pipeline/fix_wipe_single_reads_parallel.smk +++ b/pipeline/fix_wipe_single_reads_parallel.smk @@ -4,6 +4,8 @@ import os import shutil SAMPLE=config["sample_name"] +ALPHABET=config["alphabet"] +LOG_FREQ=config["log_freq"] rule all: input: @@ -49,7 +51,7 @@ rule wipe_fastq_parallel: message: "Running FastqWiper on {input}." shell:''' - fastqwiper --fastq_in {input} --fastq_out {output} --log_out data/{wildcards.sample}_chunks/{wildcards.sample}_final_summary.txt 2> {log} + fastqwiper --fastq_in {input} --fastq_out {output} --log_out data/{wildcards.sample}_chunks/{wildcards.sample}_final_summary.txt --alphabet {ALPHABET} --log_frequency {LOG_FREQ} 2> {log} ''' diff --git a/pipeline/fix_wipe_single_reads_sequential.smk b/pipeline/fix_wipe_single_reads_sequential.smk index 2f11688..c9cdf7f 100644 --- a/pipeline/fix_wipe_single_reads_sequential.smk +++ b/pipeline/fix_wipe_single_reads_sequential.smk @@ -1,6 +1,8 @@ # cmd: snakemake -s fix_wipe_single_reads.smk --use-conda --cores 2 SAMPLES=config["sample_name"] +ALPHABET=config["alphabet"] +LOG_FREQ=config["log_freq"] rule all: input: @@ -29,7 +31,5 @@ rule wipe_fastq: message: "Running FastqWiper on {input}." 
shell:''' - fastqwiper --fastq_in {input} --fastq_out {output} --log_out data/{wildcards.sample}_final_summary.txt 2> {log} + fastqwiper --fastq_in {input} --fastq_out {output} --log_out data/{wildcards.sample}_final_summary.txt --alphabet {ALPHABET} --log_frequency {LOG_FREQ} 2> {log} ''' - - diff --git a/run_wiping.sh b/run_wiping.sh index 2ac24a3..2aeab65 100644 --- a/run_wiping.sh +++ b/run_wiping.sh @@ -6,9 +6,11 @@ if [ $# -eq 0 ]; then sample_name="sample" chunk_size=50000000 qin=33 + alphabet="ACGTN" + log_freq=500000 echo "Running with custom arguments: " "$@" -elif [ $# -ge 3 ] && [ $# -le 5 ]; then +elif [ $# -ge 3 ] && [ $# -le 7 ]; then mode=$1 cores=$(($2)) sample_name=$3 @@ -25,26 +27,39 @@ elif [ $# -ge 3 ] && [ $# -le 5 ]; then qin=33 fi + # Optional + alphabet=$(($6)) + if [ "$alphabet" -eq "0" ]; then + alphabet="ACGTN" + fi + + # Optional + log_freq=$(($7)) + if [ "$log_freq" -eq "0" ]; then + log_freq=500000 + fi + + if [ "$mode" == "paired" ]; then if [ "$cores" -gt 1 ]; then echo "Processing paired-end files in parallel" - snakemake --config sample_name=$sample_name chunk_size=$chunk_size qin=$qin -s ./pipeline/fix_wipe_pairs_reads_parallel.smk --use-conda --cores $cores + snakemake --config sample_name=$sample_name chunk_size=$chunk_size qin=$qin alphabet=$alphabet log_freq=$log_freq -s ./pipeline/fix_wipe_pairs_reads_parallel.smk --use-conda --cores $cores else echo "Processing paired-end files sequentially" - snakemake --config sample_name=$sample_name qin=$qin -s ./pipeline/fix_wipe_pairs_reads_sequential.smk --use-conda --cores $cores + snakemake --config sample_name=$sample_name qin=$qin alphabet=$alphabet log_freq=$log_freq -s ./pipeline/fix_wipe_pairs_reads_sequential.smk --use-conda --cores $cores fi elif [ "$mode" == "single" ]; then if [ "$cores" -gt 1 ]; then echo "Processing single-end file in parallel" - snakemake --config sample_name=$sample_name chunk_size=$chunk_size -s ./pipeline/fix_wipe_single_reads_parallel.smk 
--use-conda --cores $cores + snakemake --config sample_name=$sample_name chunk_size=$chunk_size alphabet=$alphabet log_freq=$log_freq -s ./pipeline/fix_wipe_single_reads_parallel.smk --use-conda --cores $cores else echo "Processing single-end file sequentially" - snakemake --config sample_name=$sample_name -s ./pipeline/fix_wipe_single_reads_sequential.smk --use-conda --cores $cores + snakemake --config sample_name=$sample_name alphabet=$alphabet log_freq=$log_freq -s ./pipeline/fix_wipe_single_reads_sequential.smk --use-conda --cores $cores fi else echo "Allowed computing modes are: 'paired' or 'single'" fi else - echo "You must provide three + 2 optional arguments [computing mode ('paired' or 'single'), # of cores (int), sample name (string), chunk size (optional, int), ASCII offset (optional, 33 or 64)]" + echo "You must provide three + 4 optional arguments [computing mode ('paired' or 'single'), # of cores (int), sample name (string), chunk size (optional, int), ASCII offset (optional, 33 or 64), allowed SEQ alphabet (optional, e.g., ACGTN), log frequency (optional, 500000)]" exit 1 fi