Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Viral reclassification #75 #81

Open
wants to merge 5 commits into
base: main_unstable
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
250 changes: 202 additions & 48 deletions bin/assignment.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/usr/bin/env python

import sys
from collections import defaultdict

Expand All @@ -17,81 +19,233 @@ def trim_read_id(read_id):

return read_id

def get_mrca(taxon_id1, taxon_id2, parents):
    """
    Find the most recent common ancestor of two taxa.

    Parameters:
        taxon_id1 (str): First NCBI taxon identifier.
        taxon_id2 (str): Second NCBI taxon identifier.
        parents (dict): Maps taxon id to parent taxon id. May be None or
            empty; in that case each taxon is treated as its own root.

    Returns:
        str: Taxon id of the MRCA ("0" if either input is unclassified).
    """
    if taxon_id1 == taxon_id2:
        return taxon_id1
    if taxon_id1 == "0" or taxon_id2 == "0":
        # "0" is kraken's unclassified sentinel - no meaningful ancestor.
        return "0"

    # Guard against a missing parent map (callers may pass parents=None
    # when no taxonomy was loaded); previously this raised TypeError.
    if parents is None:
        parents = {}

    def _root_path(taxon_id):
        # Walk up the parent map from taxon_id, then reverse so the
        # resulting path is ordered root-first.
        path = []
        current = taxon_id
        while current in parents and current != "1":
            path.append(current)
            current = parents[current]
        path.append(current)
        path.reverse()
        return path

    ancestry1 = _root_path(taxon_id1)
    ancestry2 = _root_path(taxon_id2)

    # Start at the deepest depth both paths reach and move toward the
    # root until the paths agree; that shared entry is the MRCA.
    index = min(len(ancestry1), len(ancestry2)) - 1
    while index > 0 and ancestry1[index] != ancestry2[index]:
        index -= 1
    return ancestry1[index]


class KrakenAssignmentEntry:
    """
    A class representing a line in a kraken assignment file.

    Attributes:
        classified (str): C if read classified, U if unclassified
        read_id (str): The read name
        taxon_id (str): The NCBI taxon identifier
        length (int): Length of read in bp
        kmer_string (str): space separated string representing the taxon_ids matched along the read
    """
    def __init__(self, line=None):
        """
        Initializes a KrakenAssignmentEntry object.

        Parameters:
            line (str): A line from a kraken assignment file, or None for
                an empty (unclassified) entry.
        """
        self.classified = "U"
        self.read_id = ""
        self.taxon_id = "0"
        self.length = 0
        self.kmer_string = ""
        if line is not None:
            self.add_line(line)

    def __eq__(self, other):
        # Entries compare equal when every attribute matches.
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        return False

    def add_line(self, line):
        """
        Parses a line in the kraken assignment file and populates this entry.

        Parameters:
            line (str): A line from kraken assignment file.

        Exits with code 11 if the line does not have exactly 5
        tab-separated fields.
        """
        # Count fields on the same (newline-stripped) string that is
        # unpacked below. Previously the count used the raw line while the
        # unpack used line.strip(), so a trailing empty kmer field could
        # pass validation and then fail to unpack.
        fields = line.rstrip("\n").split("\t")
        if len(fields) != 5:
            sys.stderr.write(
                f"Kraken assignment line {line} badly formatted - must have 5 fields\n"
            )
            sys.exit(11)
        self.classified, self.read_id, self.taxon_id, length, self.kmer_string = fields
        self.length = int(length)

        self.read_id = trim_read_id(self.read_id)

        # "A" is an ambiguous-virus code; map it to taxon 81077
        # (artificial sequences) so downstream lookups have a numeric id.
        if self.taxon_id == "A":
            self.taxon_id = "81077"

    def declassify(self):
        """
        Changes the classified status of KrakenAssignmentEntry to unclassified
        """
        self.classified = "U"
        self.taxon_id = "0"

    def get_line(self):
        """
        Get string representation of KrakenAssignmentEntry

        Returns:
            str: Tab-separated kraken assignment line (without newline).
        """
        fields = [self.classified, self.read_id, self.taxon_id, str(self.length), self.kmer_string]
        return "\t".join(fields)

    def print(self):
        """
        Print the attributes of KrakenAssignmentEntry as a string
        """
        print(f"{self.get_line()}")


class KrakenAssignments:
    """
    A class representing a kraken assignment file.

    Attributes:
        file_name (str): Name of file to parse.
        entries (dict): Maps read_id to KrakenAssignmentEntry; populated
            by load_file().
    """
    def __init__(self, assignment_file, load=False):
        """
        Initializes a KrakenAssignments object.

        Parameters:
            assignment_file (str): Name of the kraken assignment file.
            load (bool): If set, loads the contents of the file into memory.
        """
        self.entries = defaultdict(KrakenAssignmentEntry)
        self.file_name = assignment_file

        if load:
            self.load_file()

    def __eq__(self, other):
        # Objects compare equal when every attribute matches.
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        return False

    def get_read_map(self, taxon_id_map, parents=None):
        """
        Parses the kraken assignment file and collects the read_ids associated with each of the
        required taxon ids. If paired reads are provided, will consider the common ancestor of
        the 2 assignments

        Parameters:
            taxon_id_map (iter): Iterable of taxon ids to identify reads for.
            parents (dict): A dict mapping taxon id to parent taxon id from NCBI Taxonomy

        Returns:
            read_map (dict): A dict from read_id to a taxon_id in the input iterable.
        """
        read_map = defaultdict(str)
        # Remembers the original (uncorrected) assignment of the first
        # mate seen for each read, so a second mate can be reconciled.
        extended_map = defaultdict(str)
        comments = set()
        with open(self.file_name, "r") as kfile:
            for line in kfile:
                assignment = KrakenAssignmentEntry(line)
                taxon_id, read_id = assignment.taxon_id, assignment.read_id

                # Climb the taxonomy until we reach a taxon of interest
                # (or the root) - handles assignments below the requested
                # taxa and renamed/merged taxon ids.
                corrected_taxon_id = taxon_id
                if parents:
                    while (corrected_taxon_id in parents
                           and corrected_taxon_id not in taxon_id_map
                           and corrected_taxon_id != "1"):
                        corrected_taxon_id = parents[corrected_taxon_id]
                    if corrected_taxon_id in taxon_id_map and corrected_taxon_id != taxon_id:
                        comments.add(f"Assign {taxon_id} to {corrected_taxon_id} list")

                if read_id in extended_map:
                    # Second mate of a pair: reconcile via the MRCA of the
                    # two assignments.
                    mrca_taxon_id = get_mrca(corrected_taxon_id, extended_map[read_id], parents)
                    if mrca_taxon_id in taxon_id_map:
                        if mrca_taxon_id != read_map[read_id]:
                            comments.add(f"Reassign {extended_map[read_id]} (and {corrected_taxon_id}) to mrca {mrca_taxon_id} list")
                        read_map[read_id] = mrca_taxon_id
                    elif read_id in read_map:
                        # MRCA falls outside the taxa of interest - drop
                        # the read rather than keep a clashing assignment.
                        comments.add(f"MRCA {mrca_taxon_id} of {extended_map[read_id]} and {corrected_taxon_id} not in taxon_id_map")
                        del read_map[read_id]
                elif corrected_taxon_id in taxon_id_map:
                    if corrected_taxon_id != taxon_id:
                        comments.add(f"Assign {taxon_id} to {corrected_taxon_id} list")
                    read_map[read_id] = corrected_taxon_id
                    extended_map[read_id] = taxon_id
        # Deduplicated notes about reassignments, reported once at the end.
        for c in comments:
            print(c)
        return read_map

    def load_file(self, taxon_ids=None):
        """
        Loads all entries in the kraken assignment file. If this is a paired file and there is a clash, result is unclassified

        Parameters:
            taxon_ids (iterable): A subset of taxon_ids to retain assignment lines from.
        """
        with open(self.file_name, "r") as kfile:
            for line in kfile:
                assignment = KrakenAssignmentEntry(line)
                if (taxon_ids and assignment.taxon_id in taxon_ids) or not taxon_ids:
                    if assignment.read_id in self.entries and assignment.taxon_id != self.entries[assignment.read_id].taxon_id:
                        # Paired mates disagree: mark the read unclassified.
                        self.entries[assignment.read_id].declassify()
                    else:
                        self.entries[assignment.read_id] = assignment

    def update(self, new_assignments, changes=None):
        """
        Updates read assignments using new file with preference

        Parameters:
            new_assignments (KrakenAssignments): A new loaded KrakenAssignments object.
            changes (dict): Optional nested dict counting taxon_id transitions;
                created if not given.

        Returns:
            changes (dict): changes[old_taxon_id][new_taxon_id] -> count of
                reads reassigned ("0" is used for previously-unseen reads).
        """
        if not changes:
            changes = defaultdict(lambda: defaultdict(int))

        if len(self.entries) == 0:
            # Nothing loaded yet - adopt the new assignments wholesale.
            # Return changes (not None) so callers can rely on the return type.
            self.entries = new_assignments.entries
            return changes

        for read_id, entry in new_assignments.entries.items():
            if read_id not in self.entries:
                self.entries[read_id] = entry
                changes["0"][entry.taxon_id] += 1

            elif (read_id in self.entries
                  and entry.classified == "C"
                  and entry.taxon_id != self.entries[read_id].taxon_id):
                old_taxon_id = self.entries[read_id].taxon_id
                new_taxon_id = entry.taxon_id
                self.entries[read_id] = entry
                changes[old_taxon_id][new_taxon_id] += 1

        return changes

    def save(self):
        """
        Save the KrakenAssignments object in kraken assignment format
        """
        with open(self.file_name, "w") as out:
            # Keys (read ids) are not needed to serialize entries.
            for entry in self.entries.values():
                out.write(f"{entry.get_line()}\n")

2 changes: 1 addition & 1 deletion bin/extract_fraction_from_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def main():

# Initialize kraken assignment file
kraken_assignment = KrakenAssignments(args.kraken_assignment_file)
read_map = kraken_assignment.parse_kraken_assignment_file(taxon_id_map)
read_map = kraken_assignment.get_read_map(taxon_id_map)

out_counts = extract_reads(
read_map,
Expand Down
16 changes: 11 additions & 5 deletions bin/extract_taxa_from_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ def get_taxon_id_lists(
include_parents=False,
include_children=False
):
"""
Loops through the kraken report, and for each taxon_id in the report, if it meets the thresholds, a key is added to
lists_to_extract. The values in this dictionary are all taxon_ids which should be added to the file alongside the
key taxon_id (e.g. parents or children)
"""
lists_to_extract = defaultdict(set)
for taxon in kraken_report.entries:
entry = kraken_report.entries[taxon]
Expand All @@ -43,7 +48,7 @@ def get_taxon_id_lists(
continue
if min_count_descendants and entry.count < min_count_descendants:
continue
if min_percent and kraken_report.get_percentage(taxon) < min_percent:
if min_percent and kraken_report.get_percentage(taxon, denominator="classified") < min_percent:
continue
if len(names) > 0 and entry.name not in names and taxon not in names:
continue
Expand Down Expand Up @@ -100,16 +105,16 @@ def extract_taxa(
# open read files
filetype, zipped = check_read_files(reads1)

# This inverts the lists_to_extract to identify for an assigned taxon_id, which taxon_id files it should be added to.
subtaxa_map = defaultdict(set)

for taxon, subtaxons in lists_to_extract.items():
for subtaxa in subtaxons:
subtaxa_map[subtaxa].add(taxon)
# sys.stderr.write(
# "INCLUDING PARENTS/CHILDREN, HAVE %i TAXA TO INCLUDE IN READ FILES for %s\n"
# % (len(lists_to_extract[taxon]), taxon)
# )
read_map = kraken_assignment.parse_kraken_assignment_file(subtaxa_map)
read_map = kraken_assignment.get_read_map(subtaxa_map)

prefixes = setup_prefixes(lists_to_extract, prefix)
out_counts, quals, lens, filenames = process_read_files(
Expand Down Expand Up @@ -210,7 +215,7 @@ def main():
dest="min_percent",
required=False,
type=float,
help="Minimum percentage of reads e.g 4",
help="Minimum percentage of classified reads e.g 4",
)
parser.add_argument(
"--n",
Expand Down Expand Up @@ -275,7 +280,8 @@ def main():
# Load kraken report entries
sys.stderr.write("Loading kraken report\n")
kraken_report = KrakenReport(args.report_file)
kraken_report.check_host({9606:args.max_human})
if args.max_human:
kraken_report.check_host({"9606":args.max_human})

# Initialize kraken assignment file
sys.stderr.write("Loading kraken assignments\n")
Expand Down
11 changes: 6 additions & 5 deletions bin/extract_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,8 @@ def file_iterator(read_file, read_map, subtaxa_map, inverse, out_counts, quals,
trimmed_name = trim_read_id(name)
if inverse:
if trimmed_name in reads_of_interest:
for taxon in read_map[trimmed_name]:
taxon_id = read_map[trimmed_name]
for taxon in subtaxa_map[taxon_id]:
update_summary_with_record(taxon, record, out_counts, quals, lens)
continue
else:
Expand All @@ -239,10 +240,10 @@ def file_iterator(read_file, read_map, subtaxa_map, inverse, out_counts, quals,
elif not inverse:
if trimmed_name not in reads_of_interest:
continue
for k2_taxon in read_map[trimmed_name]:
for taxon in subtaxa_map[k2_taxon]:
count += 1
add_record(taxon, record, out_counts, quals, lens, filenames, file_index, out_handles=out_handles, out_records=out_records)
taxon_id = read_map[trimmed_name]
for taxon in subtaxa_map[taxon_id]:
count += 1
add_record(taxon, record, out_counts, quals, lens, filenames, file_index, out_handles=out_handles, out_records=out_records)

if not low_memory:
save_records_to_file(out_records, filenames, file_index)
Expand Down
Loading