From b4a845a4507b92b222fc0780275f410567f6ee06 Mon Sep 17 00:00:00 2001 From: Marco van Zwetselaar Date: Wed, 20 Oct 2021 02:29:05 +0300 Subject: [PATCH 1/2] Fix exception when scanning small fastq files --- kneaddata/utilities.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kneaddata/utilities.py b/kneaddata/utilities.py index b09b47a..abf10b5 100644 --- a/kneaddata/utilities.py +++ b/kneaddata/utilities.py @@ -312,11 +312,10 @@ def check_sequence_identifier_format(file): #checking first 100 (400/4) lines num_seq_to_check=100 - num_lines_to_check=400 #Fetching first and last 100 identifier sequences first_seq_identifiers_list=get_first_n_seq_identifiers(file,num_seq_to_check) - last_seq_identifiers_list=get_last_n_seq_identifiers(file,num_lines_to_check) + last_seq_identifiers_list=get_last_n_seq_identifiers(file,num_seq_to_check) # Checking first and last 100 seq identifiers for spaces and new Illumina format for lines in first_seq_identifiers_list: new_format=sequence_identifier_format_conditions(lines) @@ -336,7 +335,7 @@ def get_last_n_seq_identifiers(file, n): last_seq_identifiers=[] # Tail to find last lines try: - process = subprocess.Popen(['tail', '-'+str(n), file], stdout=subprocess.PIPE) + process = subprocess.Popen(['tail', '-'+str(4*n), file], stdout=subprocess.PIPE) except subprocess.CalledProcessError: pass for i,line in enumerate(process.stdout.readlines()): @@ -350,7 +349,8 @@ def get_first_n_seq_identifiers(file,n): first_seq_identifiers=[] # Getting first nth seq identifier while(count Date: Wed, 20 Oct 2021 13:19:37 +0300 Subject: [PATCH 2/2] Fix scan logic, scan first and last header --- kneaddata/utilities.py | 71 ++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 48 deletions(-) diff --git a/kneaddata/utilities.py b/kneaddata/utilities.py index abf10b5..2c3ec1c 100644 --- a/kneaddata/utilities.py +++ b/kneaddata/utilities.py @@ -307,56 +307,31 @@ def get_decompressed_file(file, output_folder, temp_file_list, all_input_files): return new_file def check_sequence_identifier_format(file): - """ Check the fastq file to see if there are spaces in the identifier - and the format of the id to see if this is the new illumina format """ - - #checking first 100 (400/4) lines - num_seq_to_check=100 - - #Fetching first and last 100 identifier sequences - first_seq_identifiers_list=get_first_n_seq_identifiers(file,num_seq_to_check) - last_seq_identifiers_list=get_last_n_seq_identifiers(file,num_seq_to_check) - # Checking first and last 100 seq identifiers for spaces and new Illumina format - for lines in first_seq_identifiers_list: - new_format=sequence_identifier_format_conditions(lines) - for lines in last_seq_identifiers_list: - new_format=sequence_identifier_format_conditions(lines) - return new_format - -def sequence_identifier_format_conditions(identifier_seq): - new_format=False - if (" " in identifier_seq): - new_format=True - if not identifier_seq.endswith("/1\n") and not identifier_seq.endswith("/2\n"): - new_format=True - return new_format - -def get_last_n_seq_identifiers(file, n): - last_seq_identifiers=[] - # Tail to find last lines - try: - process = subprocess.Popen(['tail', '-'+str(4*n), file], stdout=subprocess.PIPE) - except subprocess.CalledProcessError: - pass - for i,line in enumerate(process.stdout.readlines()): - if (i%4==0): - last_seq_identifiers.append(line.decode("utf-8")) - return last_seq_identifiers - -def get_first_n_seq_identifiers(file,n): - count=0 - all_lines=read_file_n_lines(file,4) - first_seq_identifiers=[] - # Getting first nth seq identifier - while(count