From b1f4dad802c0049248fda04d90a7487ed8812d03 Mon Sep 17 00:00:00 2001 From: Charles Cowart Date: Sun, 29 Oct 2023 21:23:27 -0700 Subject: [PATCH 1/3] Fixes issue with fastq.gz files having _I1_ etc. Fixes issue with fastq.gz files having _I1_, _R1_, etc. in parts of the file-name other than the expected location. --- sequence_processing_pipeline/FastQCJob.py | 37 +++++++++++++++---- ....fastq.gz => sample1_L001_R1_001.fastq.gz} | 0 ....fastq.gz => sample1_L001_R2_001.fastq.gz} | 0 ....fastq.gz => sample2_L001_R1_001.fastq.gz} | 0 ....fastq.gz => sample2_L001_R2_001.fastq.gz} | 0 ...z => sample1_L001_R1_001.trimmed.fastq.gz} | 0 ...z => sample1_L001_R2_001.trimmed.fastq.gz} | 0 ...z => sample2_L001_R1_001.trimmed.fastq.gz} | 0 ...z => sample2_L001_R2_001.trimmed.fastq.gz} | 0 9 files changed, 29 insertions(+), 8 deletions(-) rename sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/{sample1_R1_.fastq.gz => sample1_L001_R1_001.fastq.gz} (100%) rename sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/{sample1_R2_.fastq.gz => sample1_L001_R2_001.fastq.gz} (100%) rename sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/{sample2_R1_.fastq.gz => sample2_L001_R1_001.fastq.gz} (100%) rename sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/{sample2_R2_.fastq.gz => sample2_L001_R2_001.fastq.gz} (100%) rename sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/{sample1_R1_.trimmed.fastq.gz => sample1_L001_R1_001.trimmed.fastq.gz} (100%) rename sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/{sample1_R2_.trimmed.fastq.gz => sample1_L001_R2_001.trimmed.fastq.gz} (100%) rename sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/{sample2_R1_.trimmed.fastq.gz => sample2_L001_R1_001.trimmed.fastq.gz} (100%) rename sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/{sample2_R2_.trimmed.fastq.gz => sample2_L001_R2_001.trimmed.fastq.gz} (100%) diff --git a/sequence_processing_pipeline/FastQCJob.py b/sequence_processing_pipeline/FastQCJob.py index 80601ba8..33dcbe23 100644 --- a/sequence_processing_pipeline/FastQCJob.py +++ b/sequence_processing_pipeline/FastQCJob.py @@ -1,10 +1,11 @@ from os import listdir, makedirs -from os.path import exists, join, basename +from os.path import basename, exists, join, split from sequence_processing_pipeline.Job import Job from sequence_processing_pipeline.PipelineError import PipelineError from functools import partial from json import dumps import logging +from re import compile class FastQCJob(Job): @@ -94,13 +95,33 @@ def _find_projects(self, path_to_run_id_data_fastq_dir, is_raw_input): 'zero_files' not in x] # break files up into R1, R2, I1, I2 - # assume _R1_ does not occur in the path as well. - r1_only = [x for x in files if '_R1_' in x] - r2_only = [x for x in files if '_R2_' in x] - - # amplicon runs may or may not have an i2. this is okay. - i1_only = [x for x in files if '_I1_' in x] - i2_only = [x for x in files if '_I2_' in x] + # use capturing to handle both raw files as well as trimmed and + # filtered files. We don't need to process the captured string. + i1_files = compile(r"^.*_L\d{3}_I1_\d{3}\.(trimmed\.|filtered" + r"\.|)fastq\.gz$") + i2_files = compile(r"^.*_L\d{3}_I2_\d{3}\.(trimmed\.|filtered" + r"\.|)fastq\.gz$") + r1_files = compile(r"^.*_L\d{3}_R1_\d{3}\.(trimmed\.|filtered" + r"\.|)fastq\.gz$") + r2_files = compile(r"^.*_L\d{3}_R2_\d{3}\.(trimmed\.|filtered" + r"\.|)fastq\.gz$") + + # i1_only, i2_only, r1_only, r2_only = ([] for i in range(4)) + i1_only = [] + i2_only = [] + r1_only = [] + r2_only = [] + + for some_path in files: + _, file_name = split(some_path) + if i1_files.match(file_name) is not None: + i1_only.append(some_path) + elif i2_files.match(file_name) is not None: + i2_only.append(some_path) + elif r1_files.match(file_name) is not None: + r1_only.append(some_path) + elif r2_files.match(file_name) is not None: + r2_only.append(some_path) if not self.is_amplicon and len(i1_only) != len(i2_only): raise PipelineError('counts of I1 and I2 files do not match') diff --git a/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample1_R1_.fastq.gz b/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample1_L001_R1_001.fastq.gz similarity index 100% rename from sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample1_R1_.fastq.gz rename to sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample1_L001_R1_001.fastq.gz diff --git a/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample1_R2_.fastq.gz b/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample1_L001_R2_001.fastq.gz similarity index 100% rename from sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample1_R2_.fastq.gz rename to sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample1_L001_R2_001.fastq.gz diff --git a/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample2_R1_.fastq.gz b/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample2_L001_R1_001.fastq.gz similarity index 100% rename from sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample2_R1_.fastq.gz rename to sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample2_L001_R1_001.fastq.gz diff --git a/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample2_R2_.fastq.gz b/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample2_L001_R2_001.fastq.gz similarity index 100% rename from sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample2_R2_.fastq.gz rename to sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/Data/Fastq/project1/sample2_L001_R2_001.fastq.gz diff --git a/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample1_R1_.trimmed.fastq.gz b/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample1_L001_R1_001.trimmed.fastq.gz similarity index 100% rename from sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample1_R1_.trimmed.fastq.gz rename to sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample1_L001_R1_001.trimmed.fastq.gz diff --git a/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample1_R2_.trimmed.fastq.gz b/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample1_L001_R2_001.trimmed.fastq.gz similarity index 100% rename from sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample1_R2_.trimmed.fastq.gz rename to sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample1_L001_R2_001.trimmed.fastq.gz diff --git a/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample2_R1_.trimmed.fastq.gz b/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample2_L001_R1_001.trimmed.fastq.gz similarity index 100% rename from sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample2_R1_.trimmed.fastq.gz rename to sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample2_L001_R1_001.trimmed.fastq.gz diff --git a/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample2_R2_.trimmed.fastq.gz b/sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample2_L001_R2_001.trimmed.fastq.gz similarity index 100% rename from sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample2_R2_.trimmed.fastq.gz rename to sequence_processing_pipeline/tests/data/211021_A00000_0000_SAMPLE/sample-sequence-directory/project1/filtered_sequences/sample2_L001_R2_001.trimmed.fastq.gz From c9eb8c8bccf6a709f1b784e8263b1620f4661e32 Mon Sep 17 00:00:00 2001 From: Charles Cowart <42684307+charles-cowart@users.noreply.github.com> Date: Mon, 30 Oct 2023 13:56:29 -0700 Subject: [PATCH 2/3] Update sequence_processing_pipeline/FastQCJob.py Co-authored-by: Daniel McDonald --- sequence_processing_pipeline/FastQCJob.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sequence_processing_pipeline/FastQCJob.py b/sequence_processing_pipeline/FastQCJob.py index 33dcbe23..4bc65582 100644 --- a/sequence_processing_pipeline/FastQCJob.py +++ b/sequence_processing_pipeline/FastQCJob.py @@ -106,7 +106,6 @@ def _find_projects(self, path_to_run_id_data_fastq_dir, is_raw_input): r2_files = compile(r"^.*_L\d{3}_R2_\d{3}\.(trimmed\.|filtered" r"\.|)fastq\.gz$") - # i1_only, i2_only, r1_only, r2_only = ([] for i in range(4)) i1_only = [] i2_only = [] r1_only = [] From 602be3c681e120d859770cd02e3830e828b154ea Mon Sep 17 00:00:00 2001 From: Charles Cowart <42684307+charles-cowart@users.noreply.github.com> Date: Mon, 30 Oct 2023 14:02:04 -0700 Subject: [PATCH 3/3] Update sequence_processing_pipeline/FastQCJob.py Co-authored-by: Daniel McDonald --- sequence_processing_pipeline/FastQCJob.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sequence_processing_pipeline/FastQCJob.py b/sequence_processing_pipeline/FastQCJob.py index 4bc65582..fb16fc67 100644 --- a/sequence_processing_pipeline/FastQCJob.py +++ b/sequence_processing_pipeline/FastQCJob.py @@ -121,6 +121,8 @@ def _find_projects(self, path_to_run_id_data_fastq_dir, is_raw_input): r1_only.append(some_path) elif r2_files.match(file_name) is not None: r2_only.append(some_path) + else: + raise ValueError(f"Unable to match: {some_path}") if not self.is_amplicon and len(i1_only) != len(i2_only): raise PipelineError('counts of I1 and I2 files do not match')