Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Viral reclassification #75 #81

Open
wants to merge 5 commits into
base: main_unstable
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
250 changes: 202 additions & 48 deletions bin/assignment.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#!/usr/bin/env python

import sys
from collections import defaultdict

Expand All @@ -17,81 +19,233 @@ def trim_read_id(read_id):

return read_id

def get_mrca(taxon_id1, taxon_id2, parents):
    """
    Find the most recent common ancestor of two taxa.

    Parameters:
        taxon_id1 (str): First NCBI taxon identifier.
        taxon_id2 (str): Second NCBI taxon identifier.
        parents (dict): Maps taxon id to parent taxon id. May be None or
            empty; in that case each taxon is treated as its own root.

    Returns:
        str: Taxon id of the MRCA ("0" if either input is unclassified).
    """
    if taxon_id1 == taxon_id2:
        return taxon_id1
    if taxon_id1 == "0" or taxon_id2 == "0":
        # "0" is kraken's unclassified sentinel - no meaningful ancestor.
        return "0"

    # Guard against a missing parent map (callers may pass parents=None
    # when no taxonomy was loaded); previously this raised TypeError.
    if parents is None:
        parents = {}

    def _root_path(taxon_id):
        # Walk up the parent map from taxon_id, then reverse so the
        # resulting path is ordered root-first.
        path = []
        current = taxon_id
        while current in parents and current != "1":
            path.append(current)
            current = parents[current]
        path.append(current)
        path.reverse()
        return path

    ancestry1 = _root_path(taxon_id1)
    ancestry2 = _root_path(taxon_id2)

    # Start at the deepest depth both paths reach and move toward the
    # root until the paths agree; that shared entry is the MRCA.
    index = min(len(ancestry1), len(ancestry2)) - 1
    while index > 0 and ancestry1[index] != ancestry2[index]:
        index -= 1
    return ancestry1[index]


class KrakenAssignmentEntry:
    """
    A class representing a line in a kraken assignment file.

    Attributes:
        classified (str): C if read classified, U if unclassified
        read_id (str): The read name
        taxon_id (str): The NCBI taxon identifier
        length (int): Length of read in bp
        kmer_string (str): space separated string representing the taxon_ids matched along the read
    """
    def __init__(self, line=None):
        """
        Initializes a KrakenAssignmentEntry object.

        Parameters:
            line (str): A line from a kraken assignment file, or None for
                an empty (unclassified) entry.
        """
        self.classified = "U"
        self.read_id = ""
        self.taxon_id = "0"
        self.length = 0
        self.kmer_string = ""
        if line is not None:
            self.add_line(line)

    def __eq__(self, other):
        # Entries compare equal when every attribute matches.
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        return False

    def add_line(self, line):
        """
        Parses a line in the kraken assignment file and populates this entry.

        Parameters:
            line (str): A line from kraken assignment file.

        Exits with code 11 if the line does not have exactly 5
        tab-separated fields.
        """
        # Count fields on the same (newline-stripped) string that is
        # unpacked below. Previously the count used the raw line while the
        # unpack used line.strip(), so a trailing empty kmer field could
        # pass validation and then fail to unpack.
        fields = line.rstrip("\n").split("\t")
        if len(fields) != 5:
            sys.stderr.write(
                f"Kraken assignment line {line} badly formatted - must have 5 fields\n"
            )
            sys.exit(11)
        self.classified, self.read_id, self.taxon_id, length, self.kmer_string = fields
        self.length = int(length)

        self.read_id = trim_read_id(self.read_id)

        # "A" is an ambiguous-virus code; map it to taxon 81077
        # (artificial sequences) so downstream lookups have a numeric id.
        if self.taxon_id == "A":
            self.taxon_id = "81077"

    def declassify(self):
        """
        Changes the classified status of KrakenAssignmentEntry to unclassified
        """
        self.classified = "U"
        self.taxon_id = "0"

    def get_line(self):
        """
        Get string representation of KrakenAssignmentEntry

        Returns:
            str: Tab-separated kraken assignment line (without newline).
        """
        fields = [self.classified, self.read_id, self.taxon_id, str(self.length), self.kmer_string]
        return "\t".join(fields)

    def print(self):
        """
        Print the attributes of KrakenAssignmentEntry as a string
        """
        print(f"{self.get_line()}")


class KrakenAssignments:
    """
    A class representing a kraken assignment file.

    Attributes:
        file_name (str): Name of file to parse.
        entries (dict): Maps read_id to KrakenAssignmentEntry; populated
            by load_file().
    """
    def __init__(self, assignment_file, load=False):
        """
        Initializes a KrakenAssignments object.

        Parameters:
            assignment_file (str): Name of the kraken assignment file.
            load (bool): If set, loads the contents of the file into memory.
        """
        self.entries = defaultdict(KrakenAssignmentEntry)
        self.file_name = assignment_file

        if load:
            self.load_file()

    def __eq__(self, other):
        # Objects compare equal when every attribute matches.
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        return False

    def get_read_map(self, taxon_id_map, parents=None):
        """
        Parses the kraken assignment file and collects the read_ids associated with each of the
        required taxon ids. If paired reads are provided, will consider the common ancestor of
        the 2 assignments

        Parameters:
            taxon_id_map (iter): Iterable of taxon ids to identify reads for.
            parents (dict): A dict mapping taxon id to parent taxon id from NCBI Taxonomy

        Returns:
            read_map (dict): A dict from read_id to a taxon_id in the input iterable.
        """
        read_map = defaultdict(str)
        # Remembers the original (uncorrected) assignment of the first
        # mate seen for each read, so a second mate can be reconciled.
        extended_map = defaultdict(str)
        comments = set()
        with open(self.file_name, "r") as kfile:
            for line in kfile:
                assignment = KrakenAssignmentEntry(line)
                taxon_id, read_id = assignment.taxon_id, assignment.read_id

                # Climb the taxonomy until we reach a taxon of interest
                # (or the root) - handles assignments below the requested
                # taxa and renamed/merged taxon ids.
                corrected_taxon_id = taxon_id
                if parents:
                    while (corrected_taxon_id in parents
                           and corrected_taxon_id not in taxon_id_map
                           and corrected_taxon_id != "1"):
                        corrected_taxon_id = parents[corrected_taxon_id]
                    if corrected_taxon_id in taxon_id_map and corrected_taxon_id != taxon_id:
                        comments.add(f"Assign {taxon_id} to {corrected_taxon_id} list")

                if read_id in extended_map:
                    # Second mate of a pair: reconcile via the MRCA of the
                    # two assignments.
                    mrca_taxon_id = get_mrca(corrected_taxon_id, extended_map[read_id], parents)
                    if mrca_taxon_id in taxon_id_map:
                        if mrca_taxon_id != read_map[read_id]:
                            comments.add(f"Reassign {extended_map[read_id]} (and {corrected_taxon_id}) to mrca {mrca_taxon_id} list")
                        read_map[read_id] = mrca_taxon_id
                    elif read_id in read_map:
                        # MRCA falls outside the taxa of interest - drop
                        # the read rather than keep a clashing assignment.
                        comments.add(f"MRCA {mrca_taxon_id} of {extended_map[read_id]} and {corrected_taxon_id} not in taxon_id_map")
                        del read_map[read_id]
                elif corrected_taxon_id in taxon_id_map:
                    if corrected_taxon_id != taxon_id:
                        comments.add(f"Assign {taxon_id} to {corrected_taxon_id} list")
                    read_map[read_id] = corrected_taxon_id
                    extended_map[read_id] = taxon_id
        # Deduplicated notes about reassignments, reported once at the end.
        for c in comments:
            print(c)
        return read_map

    def load_file(self, taxon_ids=None):
        """
        Loads all entries in the kraken assignment file. If this is a paired file and there is a clash, result is unclassified

        Parameters:
            taxon_ids (iterable): A subset of taxon_ids to retain assignment lines from.
        """
        with open(self.file_name, "r") as kfile:
            for line in kfile:
                assignment = KrakenAssignmentEntry(line)
                if (taxon_ids and assignment.taxon_id in taxon_ids) or not taxon_ids:
                    if assignment.read_id in self.entries and assignment.taxon_id != self.entries[assignment.read_id].taxon_id:
                        # Paired mates disagree: mark the read unclassified.
                        self.entries[assignment.read_id].declassify()
                    else:
                        self.entries[assignment.read_id] = assignment

    def update(self, new_assignments, changes=None):
        """
        Updates read assignments using new file with preference

        Parameters:
            new_assignments (KrakenAssignments): A new loaded KrakenAssignments object.
            changes (dict): Optional nested dict counting taxon_id transitions;
                created if not given.

        Returns:
            changes (dict): changes[old_taxon_id][new_taxon_id] -> count of
                reads reassigned ("0" is used for previously-unseen reads).
        """
        if not changes:
            changes = defaultdict(lambda: defaultdict(int))

        if len(self.entries) == 0:
            # Nothing loaded yet - adopt the new assignments wholesale.
            # Return changes (not None) so callers can rely on the return type.
            self.entries = new_assignments.entries
            return changes

        for read_id, entry in new_assignments.entries.items():
            if read_id not in self.entries:
                self.entries[read_id] = entry
                changes["0"][entry.taxon_id] += 1

            elif (read_id in self.entries
                  and entry.classified == "C"
                  and entry.taxon_id != self.entries[read_id].taxon_id):
                old_taxon_id = self.entries[read_id].taxon_id
                new_taxon_id = entry.taxon_id
                self.entries[read_id] = entry
                changes[old_taxon_id][new_taxon_id] += 1

        return changes

    def save(self):
        """
        Save the KrakenAssignments object in kraken assignment format
        """
        with open(self.file_name, "w") as out:
            # Keys (read ids) are not needed to serialize entries.
            for entry in self.entries.values():
                out.write(f"{entry.get_line()}\n")

2 changes: 1 addition & 1 deletion bin/extract_fraction_from_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def main():

# Initialize kraken assignment file
kraken_assignment = KrakenAssignments(args.kraken_assignment_file)
read_map = kraken_assignment.parse_kraken_assignment_file(taxon_id_map)
read_map = kraken_assignment.get_read_map(taxon_id_map)

out_counts = extract_reads(
read_map,
Expand Down
16 changes: 11 additions & 5 deletions bin/extract_taxa_from_reads.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ def get_taxon_id_lists(
include_parents=False,
include_children=False
):
"""
Loops through the kraken report, and for each taxon_id in the report, if it meets the thresholds, a key is added to
lists_to_extract. The values in this dictionary are all taxon_ids which should be added to the file alongside the
key taxon_id (e.g. parents or children)
"""
lists_to_extract = defaultdict(set)
for taxon in kraken_report.entries:
entry = kraken_report.entries[taxon]
Expand All @@ -43,7 +48,7 @@ def get_taxon_id_lists(
continue
if min_count_descendants and entry.count < min_count_descendants:
continue
if min_percent and kraken_report.get_percentage(taxon) < min_percent:
if min_percent and kraken_report.get_percentage(taxon, denominator="classified") < min_percent:
continue
if len(names) > 0 and entry.name not in names and taxon not in names:
continue
Expand Down Expand Up @@ -100,16 +105,16 @@ def extract_taxa(
# open read files
filetype, zipped = check_read_files(reads1)

# This inverts the lists_to_extract to identify for an assigned taxon_id, which taxon_id files it should be added to.
subtaxa_map = defaultdict(set)

for taxon, subtaxons in lists_to_extract.items():
for subtaxa in subtaxons:
subtaxa_map[subtaxa].add(taxon)
# sys.stderr.write(
# "INCLUDING PARENTS/CHILDREN, HAVE %i TAXA TO INCLUDE IN READ FILES for %s\n"
# % (len(lists_to_extract[taxon]), taxon)
# )
read_map = kraken_assignment.parse_kraken_assignment_file(subtaxa_map)
read_map = kraken_assignment.get_read_map(subtaxa_map)

prefixes = setup_prefixes(lists_to_extract, prefix)
out_counts, quals, lens, filenames = process_read_files(
Expand Down Expand Up @@ -210,7 +215,7 @@ def main():
dest="min_percent",
required=False,
type=float,
help="Minimum percentage of reads e.g 4",
help="Minimum percentage of classified reads e.g 4",
)
parser.add_argument(
"--n",
Expand Down Expand Up @@ -275,7 +280,8 @@ def main():
# Load kraken report entries
sys.stderr.write("Loading kraken report\n")
kraken_report = KrakenReport(args.report_file)
kraken_report.check_host({9606:args.max_human})
if args.max_human:
kraken_report.check_host({"9606":args.max_human})

# Initialize kraken assignment file
sys.stderr.write("Loading kraken assignments\n")
Expand Down
11 changes: 6 additions & 5 deletions bin/extract_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,8 @@ def file_iterator(read_file, read_map, subtaxa_map, inverse, out_counts, quals,
trimmed_name = trim_read_id(name)
if inverse:
if trimmed_name in reads_of_interest:
for taxon in read_map[trimmed_name]:
taxon_id = read_map[trimmed_name]
for taxon in subtaxa_map[taxon_id]:
update_summary_with_record(taxon, record, out_counts, quals, lens)
continue
else:
Expand All @@ -239,10 +240,10 @@ def file_iterator(read_file, read_map, subtaxa_map, inverse, out_counts, quals,
elif not inverse:
if trimmed_name not in reads_of_interest:
continue
for k2_taxon in read_map[trimmed_name]:
for taxon in subtaxa_map[k2_taxon]:
count += 1
add_record(taxon, record, out_counts, quals, lens, filenames, file_index, out_handles=out_handles, out_records=out_records)
taxon_id = read_map[trimmed_name]
for taxon in subtaxa_map[taxon_id]:
count += 1
add_record(taxon, record, out_counts, quals, lens, filenames, file_index, out_handles=out_handles, out_records=out_records)

if not low_memory:
save_records_to_file(out_records, filenames, file_index)
Expand Down
Loading