Skip to content

Commit

Permalink
Merge pull request #71 from RIVM-bioinformatics/dev
Browse files Browse the repository at this point in the history
multi-ref support
  • Loading branch information
florianzwagemaker authored Aug 8, 2023
2 parents cb850a9 + b2e1c38 commit 129d13e
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 62 deletions.
5 changes: 4 additions & 1 deletion AmpliGone/AmpliGone.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,10 @@ def main():
removed_coordinates = tuple(chain(*ProcessedReads["Removed_coordinates"]))

log.info(
f"\tRemoved a total of [bold cyan]{len(removed_coordinates)}[/bold cyan] nucleotides. This is [bold cyan]{round((len(removed_coordinates)/total_nuc_preprocessing)*100, 2)}%[/bold cyan] of the total amount of nucleotides present in the original reads."
f"\tRemoved a total of [bold cyan]{len(removed_coordinates)}[/bold cyan] nucleotides."
)
log.info(
f"\tThis is [bold cyan]{round((len(removed_coordinates)/total_nuc_preprocessing)*100, 2)}%[/bold cyan] of the total amount of nucleotides present in the original reads."
)
log.info(
f"\tThese nucleotides were removed from [bold cyan]{len(set(removed_coordinates))}[/bold cyan] unique nucleotide-coordinates."
Expand Down
36 changes: 27 additions & 9 deletions AmpliGone/cut_reads.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from collections import defaultdict

import mappy as mp
import pandas as pd

Expand Down Expand Up @@ -70,16 +72,24 @@ def CutReads(
workers,
):
Frame, _threadnumber = data
RVSet = set()
FWSet = set()
for _, start, end, strand in primer_df[["start", "end", "strand"]].itertuples():

RVDict = defaultdict(set)
FWDict = defaultdict(set)

reference_ids = set(primer_df["ref"].unique())
for refid in reference_ids:
RVDict[refid] = set()
FWDict[refid] = set()

for _, refid, start, end, strand in primer_df[
["ref", "start", "end", "strand"]
].itertuples():

for coord in range(start + 1, end): # +1 because reference is 1-based
if strand == "+":
FWSet.add(coord)
FWDict[refid].add(coord)
elif strand == "-":
RVSet.add(coord)
FWList = tuple(FWSet) # Since tuples are hashable
RVList = tuple(RVSet)
RVDict[refid].add(coord)

Aln = mp.Aligner(
reference,
Expand Down Expand Up @@ -127,6 +137,14 @@ def CutReads(

previous_seq = seq

# Fetch the primer coordinates that correspond to the reference that the read maps to
# we're using tuples here because they are hashable
FWTuple = tuple(FWDict[hit.ctg])
RVTuple = tuple(RVDict[hit.ctg])

if not FWTuple or not RVTuple:
print(FWTuple, RVTuple, hit.ctg)

qstart = hit.q_st
qend = hit.q_en

Expand All @@ -139,7 +157,7 @@ def CutReads(
seq,
qual,
PositionNeedsCutting=PositionInOrBeforePrimer,
primer_list=FWList,
primer_list=FWTuple,
position_on_reference=hit.r_st,
cut_direction=1,
read_direction=hit.strand,
Expand All @@ -159,7 +177,7 @@ def CutReads(
seq,
qual,
PositionNeedsCutting=PositionInOrAfterPrimer,
primer_list=RVList,
primer_list=RVTuple,
position_on_reference=hit.r_en,
cut_direction=-1,
read_direction=hit.strand,
Expand Down
4 changes: 2 additions & 2 deletions AmpliGone/cutlery.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def PositionInOrBeforePrimer(pos, clist, max_lookaround):
"""
d = lambda x: abs(x - pos)
near = min(clist, key=d)
near = min(clist, key=d, default=0)
return abs(pos - near) < max_lookaround and pos <= near


Expand All @@ -47,7 +47,7 @@ def PositionInOrAfterPrimer(pos, clist, max_lookaround):
"""
d = lambda x: abs(x - pos)
near = min(clist, key=d)
near = min(clist, key=d, default=0)
return abs(pos - near) < max_lookaround and pos >= near


Expand Down
97 changes: 52 additions & 45 deletions AmpliGone/fasta2bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,52 +91,59 @@ def CoordListGen(primerfile, referencefile, err_rate=0.1):

primers = list(SeqIO.parse(primerfile, "fasta"))

ref_file = SeqIO.read(referencefile, "fasta")
ref_seq = str(ref_file.seq)

for primer in primers:
seq = str(primer.seq)
revcomp = Seq.reverse_complement(seq)

coords = get_coords(seq, ref_seq, err_rate)
rev_coords = get_coords(revcomp, ref_seq, err_rate)
if coords and rev_coords:
log.warning(
f"Primer [yellow underline]{primer.id}[/yellow underline] found on both forward and reverse strand of '[yellow]{ref_file.name}[/yellow]'.\n[yellow bold]Check to see if this is intended.[/yellow bold]"
)
if not coords and not rev_coords:
log.warning(
f"Skipping [yellow underline]{primer.id}[/yellow underline] as it is found on neither forward or reverse strand of [yellow underline]{ref_file.name}[/yellow underline].\n[yellow bold]Check to see if this is intended.[/yellow bold]"
)
continue
if coords and len(coords) > 1:
log.warning(
f"Primer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]forward strand[/underline]: \n\t[yellow]{coords}\n[bold]Check to see if this is intended.[/yellow][/bold]"
)
if rev_coords and len(rev_coords) > 1:
log.warning(
f"Primer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]reverse strand[/underline]: {rev_coords}.\nCheck to see if this is intended."
)
for start, end in coords.union(rev_coords):
if any(o in primer.id for o in keyl):
strand = "+"
elif any(o in primer.id for o in keyr):
strand = "-"
else:
log.error(
f"Primer name {primer.id} does not contain orientation (e.g. {primer.id}_RIGHT). Consider suffixing with {keyl + keyr}"
ref_file = list(SeqIO.parse(referencefile, "fasta"))
ref_seq = [str(ref.seq) for ref in ref_file]
ref_id = [ref.id for ref in ref_file]

# The loop in a loop here is not a particularly efficient way of doing this.
# But this is the easiest implementation for now, and it's not like this is a particularly
# cpu or time intensive process anyway.
# Might come back to this when there's more time to create a better solution.
for ref_seq, ref_id in zip(ref_seq, ref_id):
log.info(f"Searching for primers in reference-id: [yellow]{ref_id}[/yellow]")
for primer in primers:
seq = str(primer.seq)
revcomp = Seq.reverse_complement(seq)

coords = get_coords(seq, ref_seq, err_rate)
rev_coords = get_coords(revcomp, ref_seq, err_rate)
if coords and rev_coords:
log.warning(
f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on both [underline]forward[/underline] and [underline]reverse strand[/underline] of [yellow underline]{ref_id}[/yellow underline].\n\t[yellow bold]Check to see if this is intended.[/yellow bold]"
)
if not coords and not rev_coords:
log.warning(
f"\tSkipping [yellow underline]{primer.id}[/yellow underline] as it is found on neither [underline]forward[/underline] or [underline]reverse strand[/underline] of [yellow underline]{ref_id}[/yellow underline].\n\t[yellow bold]Check to see if this is intended.[/yellow bold]"
)
continue
if coords and len(coords) > 1:
log.warning(
f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]forward strand[/underline] of [yellow underline]{ref_id}[/yellow underline]: \n\t[yellow]{coords}\n\t[bold]Check to see if this is intended.[/yellow][/bold]"
)
if rev_coords and len(rev_coords) > 1:
log.warning(
f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]reverse strand[/underline] of [yellow underline]{ref_id}[/yellow underline]: \n\t[yellow]{rev_coords}\n\t[bold]Check to see if this is intended.[/yellow][/bold]"
)
for start, end in coords.union(rev_coords):
if any(o in primer.id for o in keyl):
strand = "+"
elif any(o in primer.id for o in keyr):
strand = "-"
else:
log.error(
f"Primer name {primer.id} does not contain orientation (e.g. {primer.id}_RIGHT). Consider suffixing with {keyl + keyr}"
)
continue
yield dict(
ref=ref_id,
start=start,
end=end,
name=primer.id,
score=".",
strand=strand,
seq=seq,
revcomp=revcomp,
)
exit(1)
yield dict(
ref=ref_file.name,
start=start,
end=end,
name=primer.id,
score=".",
strand=strand,
seq=seq,
revcomp=revcomp,
)


def CoordinateListsToBed(df, outfile):
Expand Down
11 changes: 6 additions & 5 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ authors:
affiliation: >-
National Institute for Public Health and the
Environment (RIVM)
orcid: https://orcid.org/0009-0002-6384-3446
orcid: 'https://orcid.org/0009-0002-6384-3446'
- name: "The RIVM-IDS Bioinformatics team"
version: 1.2.1 #x-release-please-version
# identifiers:
# - type: doi
# value: none
# description: This is the collection of archived snapshots of all versions of AmpliGone
doi: 10.5281/zenodo.7684307
identifiers:
- type: doi
value: 10.5281/zenodo.7684307
description: This is the collection of archived snapshots of all versions of AmpliGone
repository-code: 'https://github.com/RIVM-bioinformatics/AmpliGone'
url: >-
https://rivm-bioinformatics.github.io/AmpliGone/latest/
Expand Down

0 comments on commit 129d13e

Please sign in to comment.