From b4a845a4507b92b222fc0780275f410567f6ee06 Mon Sep 17 00:00:00 2001
From: Marco van Zwetselaar <io@zwets.it>
Date: Wed, 20 Oct 2021 02:29:05 +0300
Subject: [PATCH 1/2] Fix exception when scanning small fastq files

---
 kneaddata/utilities.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kneaddata/utilities.py b/kneaddata/utilities.py
index b09b47a..abf10b5 100644
--- a/kneaddata/utilities.py
+++ b/kneaddata/utilities.py
@@ -312,11 +312,10 @@ def check_sequence_identifier_format(file):
     
     #checking first 100 (400/4) lines
     num_seq_to_check=100 
-    num_lines_to_check=400
     
     #Fetching first and last 100 identifier sequences
     first_seq_identifiers_list=get_first_n_seq_identifiers(file,num_seq_to_check)
-    last_seq_identifiers_list=get_last_n_seq_identifiers(file,num_lines_to_check)
+    last_seq_identifiers_list=get_last_n_seq_identifiers(file,num_seq_to_check)
     # Checking first and last 100 seq identifiers for spaces and new Illumina format
     for lines in first_seq_identifiers_list:
         new_format=sequence_identifier_format_conditions(lines)
@@ -336,7 +335,7 @@ def get_last_n_seq_identifiers(file, n):
     last_seq_identifiers=[]
     # Tail to find last lines
     try:
-        process = subprocess.Popen(['tail', '-'+str(n), file], stdout=subprocess.PIPE)
+        process = subprocess.Popen(['tail', '-'+str(4*n), file], stdout=subprocess.PIPE)
     except subprocess.CalledProcessError:
         pass
     for i,line in enumerate(process.stdout.readlines()):
@@ -350,7 +349,8 @@ def get_first_n_seq_identifiers(file,n):
     first_seq_identifiers=[]
     # Getting first nth seq identifier
     while(count<n):
-        lines=next(all_lines)
+        lines=next(all_lines, None)
+        if not lines: break
         first_seq_identifiers.append(lines[0])
         count+=1
     return first_seq_identifiers

From a607d479b83050df18fac35a9745aeab45781f5d Mon Sep 17 00:00:00 2001
From: Marco van Zwetselaar <io@zwets.it>
Date: Wed, 20 Oct 2021 13:19:37 +0300
Subject: [PATCH 2/2] Fix scan logic, scan first and last header

---
 kneaddata/utilities.py | 71 ++++++++++++++----------------------------
 1 file changed, 23 insertions(+), 48 deletions(-)

diff --git a/kneaddata/utilities.py b/kneaddata/utilities.py
index abf10b5..2c3ec1c 100644
--- a/kneaddata/utilities.py
+++ b/kneaddata/utilities.py
@@ -307,56 +307,31 @@ def get_decompressed_file(file, output_folder, temp_file_list, all_input_files):
     return new_file
 
 def check_sequence_identifier_format(file):
-    """ Check the fastq file to see if there are spaces in the identifier
-        and the format of the id to see if this is the new illumina format """ 
-    
-    #checking first 100 (400/4) lines
-    num_seq_to_check=100 
-    
-    #Fetching first and last 100 identifier sequences
-    first_seq_identifiers_list=get_first_n_seq_identifiers(file,num_seq_to_check)
-    last_seq_identifiers_list=get_last_n_seq_identifiers(file,num_seq_to_check)
-    # Checking first and last 100 seq identifiers for spaces and new Illumina format
-    for lines in first_seq_identifiers_list:
-        new_format=sequence_identifier_format_conditions(lines)
-    for lines in last_seq_identifiers_list:
-        new_format=sequence_identifier_format_conditions(lines)
-    return new_format
-  
-def sequence_identifier_format_conditions(identifier_seq):
-    new_format=False
-    if (" " in identifier_seq):
-        new_format=True
-    if not identifier_seq.endswith("/1\n") and not identifier_seq.endswith("/2\n"):
-        new_format=True
-    return new_format
-    
-def get_last_n_seq_identifiers(file, n):
-    last_seq_identifiers=[]
-    # Tail to find last lines
-    try:
-        process = subprocess.Popen(['tail', '-'+str(4*n), file], stdout=subprocess.PIPE)
-    except subprocess.CalledProcessError:
-        pass
-    for i,line in enumerate(process.stdout.readlines()):
-        if (i%4==0):
-            last_seq_identifiers.append(line.decode("utf-8")) 
-    return last_seq_identifiers
-    
-def get_first_n_seq_identifiers(file,n):
-    count=0
-    all_lines=read_file_n_lines(file,4)
-    first_seq_identifiers=[]
-    # Getting first nth seq identifier
-    while(count<n):
-        lines=next(all_lines, None)
-        if not lines: break
-        first_seq_identifiers.append(lines[0])
-        count+=1
-    return first_seq_identifiers
+    """ Check the first and last read in the fastq file and return True if either
+        identifier contains a space or doesn't end with /1 or /2 (False if no reads). """
 
+    is_new_format=lambda s: " " in s or not (s.endswith("/1\n") or s.endswith("/2\n"))
+
+    read_iter=read_file_n_lines(file,4)
+
+    # Collect the first read, exit with False if none
+    read=next(read_iter, None)
+    if not read:
+        return False
+
+    # Return True if the ID of read 1 is new format
+    read_id=read[0]
+    if is_new_format(read_id):
+        return True
+
+    # Otherwise read through the rest of the file to reach the last read
+    while read:
+        read_id=read[0]
+        read=next(read_iter, None)
+
+    # Return True iff the last read is new format
+    return is_new_format(read_id)
 
-        
 def get_reformatted_identifiers(file, input_index, output_folder, temp_file_list, all_input_files):
     """ Reformat the sequence identifiers in the fastq file writing to a temp file """