Merge pull request #71 from RIVM-bioinformatics/dev

multi-ref support
RIVM-bioinformatics · Aug 8, 2023 · 129d13e · 129d13e
2 parents cb850a9 + b2e1c38
commit 129d13e
Show file tree

Hide file tree

Showing 5 changed files with 91 additions and 62 deletions.
diff --git a/AmpliGone/AmpliGone.py b/AmpliGone/AmpliGone.py
@@ -347,7 +347,10 @@ def main():
     removed_coordinates = tuple(chain(*ProcessedReads["Removed_coordinates"]))
 
     log.info(
-        f"\tRemoved a total of [bold cyan]{len(removed_coordinates)}[/bold cyan] nucleotides. This is [bold cyan]{round((len(removed_coordinates)/total_nuc_preprocessing)*100, 2)}%[/bold cyan] of the total amount of nucleotides present in the original reads."
+        f"\tRemoved a total of [bold cyan]{len(removed_coordinates)}[/bold cyan] nucleotides."
+    )
+    log.info(
+        f"\tThis is [bold cyan]{round((len(removed_coordinates)/total_nuc_preprocessing)*100, 2)}%[/bold cyan] of the total amount of nucleotides present in the original reads."
     )
     log.info(
         f"\tThese nucleotides were removed from [bold cyan]{len(set(removed_coordinates))}[/bold cyan] unique nucleotide-coordinates."

diff --git a/AmpliGone/cut_reads.py b/AmpliGone/cut_reads.py
@@ -1,3 +1,5 @@
+from collections import defaultdict
+
 import mappy as mp
 import pandas as pd
 
@@ -70,16 +72,24 @@ def CutReads(
     workers,
 ):
     Frame, _threadnumber = data
-    RVSet = set()
-    FWSet = set()
-    for _, start, end, strand in primer_df[["start", "end", "strand"]].itertuples():
+
+    RVDict = defaultdict(set)
+    FWDict = defaultdict(set)
+
+    reference_ids = set(primer_df["ref"].unique())
+    for refid in reference_ids:
+        RVDict[refid] = set()
+        FWDict[refid] = set()
+
+    for _, refid, start, end, strand in primer_df[
+        ["ref", "start", "end", "strand"]
+    ].itertuples():
+
         for coord in range(start + 1, end):  # +1 because reference is 1-based
             if strand == "+":
-                FWSet.add(coord)
+                FWDict[refid].add(coord)
             elif strand == "-":
-                RVSet.add(coord)
-    FWList = tuple(FWSet)  # Since tuples are hashable
-    RVList = tuple(RVSet)
+                RVDict[refid].add(coord)
 
     Aln = mp.Aligner(
         reference,
@@ -127,6 +137,14 @@ def CutReads(
 
                 previous_seq = seq
 
+                # Fetch the primer coordinates that correspond to the reference that the read maps to
+                # we're using tuples here because they are hashable
+                FWTuple = tuple(FWDict[hit.ctg])
+                RVTuple = tuple(RVDict[hit.ctg])
+
+                if not FWTuple or not RVTuple:
+                    print(FWTuple, RVTuple, hit.ctg)
+
                 qstart = hit.q_st
                 qend = hit.q_en
 
@@ -139,7 +157,7 @@ def CutReads(
                         seq,
                         qual,
                         PositionNeedsCutting=PositionInOrBeforePrimer,
-                        primer_list=FWList,
+                        primer_list=FWTuple,
                         position_on_reference=hit.r_st,
                         cut_direction=1,
                         read_direction=hit.strand,
@@ -159,7 +177,7 @@ def CutReads(
                         seq,
                         qual,
                         PositionNeedsCutting=PositionInOrAfterPrimer,
-                        primer_list=RVList,
+                        primer_list=RVTuple,
                         position_on_reference=hit.r_en,
                         cut_direction=-1,
                         read_direction=hit.strand,

diff --git a/AmpliGone/cutlery.py b/AmpliGone/cutlery.py
@@ -22,7 +22,7 @@ def PositionInOrBeforePrimer(pos, clist, max_lookaround):
 
     """
     d = lambda x: abs(x - pos)
-    near = min(clist, key=d)
+    near = min(clist, key=d, default=0)
     return abs(pos - near) < max_lookaround and pos <= near
 
 
@@ -47,7 +47,7 @@ def PositionInOrAfterPrimer(pos, clist, max_lookaround):
 
     """
     d = lambda x: abs(x - pos)
-    near = min(clist, key=d)
+    near = min(clist, key=d, default=0)
     return abs(pos - near) < max_lookaround and pos >= near
 
 

diff --git a/AmpliGone/fasta2bed.py b/AmpliGone/fasta2bed.py
@@ -91,52 +91,59 @@ def CoordListGen(primerfile, referencefile, err_rate=0.1):
 
     primers = list(SeqIO.parse(primerfile, "fasta"))
 
-    ref_file = SeqIO.read(referencefile, "fasta")
-    ref_seq = str(ref_file.seq)
-
-    for primer in primers:
-        seq = str(primer.seq)
-        revcomp = Seq.reverse_complement(seq)
-
-        coords = get_coords(seq, ref_seq, err_rate)
-        rev_coords = get_coords(revcomp, ref_seq, err_rate)
-        if coords and rev_coords:
-            log.warning(
-                f"Primer [yellow underline]{primer.id}[/yellow underline] found on both forward and reverse strand of '[yellow]{ref_file.name}[/yellow]'.\n[yellow bold]Check to see if this is intended.[/yellow bold]"
-            )
-        if not coords and not rev_coords:
-            log.warning(
-                f"Skipping [yellow underline]{primer.id}[/yellow underline] as it is found on neither forward or reverse strand of [yellow underline]{ref_file.name}[/yellow underline].\n[yellow bold]Check to see if this is intended.[/yellow bold]"
-            )
-            continue
-        if coords and len(coords) > 1:
-            log.warning(
-                f"Primer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]forward strand[/underline]: \n\t[yellow]{coords}\n[bold]Check to see if this is intended.[/yellow][/bold]"
-            )
-        if rev_coords and len(rev_coords) > 1:
-            log.warning(
-                f"Primer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]reverse strand[/underline]: {rev_coords}.\nCheck to see if this is intended."
-            )
-        for start, end in coords.union(rev_coords):
-            if any(o in primer.id for o in keyl):
-                strand = "+"
-            elif any(o in primer.id for o in keyr):
-                strand = "-"
-            else:
-                log.error(
-                    f"Primer name {primer.id} does not contain orientation (e.g. {primer.id}_RIGHT). Consider suffixing with {keyl + keyr}"
+    ref_file = list(SeqIO.parse(referencefile, "fasta"))
+    ref_seq = [str(ref.seq) for ref in ref_file]
+    ref_id = [ref.id for ref in ref_file]
+
+    # The loop in a loop here is not a particularly efficient way of doing this.
+    # But this is the easiest implementation for now, and it's not like this is a particularly
+    # cpu or time intensive process anyway.
+    # Might come back to this when there's more time to create a better solution.
+    for ref_seq, ref_id in zip(ref_seq, ref_id):
+        log.info(f"Searching for primers in reference-id: [yellow]{ref_id}[/yellow]")
+        for primer in primers:
+            seq = str(primer.seq)
+            revcomp = Seq.reverse_complement(seq)
+
+            coords = get_coords(seq, ref_seq, err_rate)
+            rev_coords = get_coords(revcomp, ref_seq, err_rate)
+            if coords and rev_coords:
+                log.warning(
+                    f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on both [underline]forward[/underline] and [underline]reverse strand[/underline] of [yellow underline]{ref_id}[/yellow underline].\n\t[yellow bold]Check to see if this is intended.[/yellow bold]"
+                )
+            if not coords and not rev_coords:
+                log.warning(
+                    f"\tSkipping [yellow underline]{primer.id}[/yellow underline] as it is found on neither [underline]forward[/underline] or [underline]reverse strand[/underline] of [yellow underline]{ref_id}[/yellow underline].\n\t[yellow bold]Check to see if this is intended.[/yellow bold]"
+                )
+                continue
+            if coords and len(coords) > 1:
+                log.warning(
+                    f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]forward strand[/underline] of [yellow underline]{ref_id}[/yellow underline]: \n\t[yellow]{coords}\n\t[bold]Check to see if this is intended.[/yellow][/bold]"
+                )
+            if rev_coords and len(rev_coords) > 1:
+                log.warning(
+                    f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]reverse strand[/underline] of [yellow underline]{ref_id}[/yellow underline]: \n\t[yellow]{rev_coords}\n\t[bold]Check to see if this is intended.[/yellow][/bold]"
+                )
+            for start, end in coords.union(rev_coords):
+                if any(o in primer.id for o in keyl):
+                    strand = "+"
+                elif any(o in primer.id for o in keyr):
+                    strand = "-"
+                else:
+                    log.error(
+                        f"Primer name {primer.id} does not contain orientation (e.g. {primer.id}_RIGHT). Consider suffixing with {keyl + keyr}"
+                    )
+                    continue
+                yield dict(
+                    ref=ref_id,
+                    start=start,
+                    end=end,
+                    name=primer.id,
+                    score=".",
+                    strand=strand,
+                    seq=seq,
+                    revcomp=revcomp,
                 )
-                exit(1)
-            yield dict(
-                ref=ref_file.name,
-                start=start,
-                end=end,
-                name=primer.id,
-                score=".",
-                strand=strand,
-                seq=seq,
-                revcomp=revcomp,
-            )
 
 
 def CoordinateListsToBed(df, outfile):

diff --git a/CITATION.cff b/CITATION.cff
@@ -13,13 +13,14 @@ authors:
     affiliation: >-
       National Institute for Public Health and the
       Environment (RIVM)
-    orcid: https://orcid.org/0009-0002-6384-3446
+    orcid: 'https://orcid.org/0009-0002-6384-3446'
   - name: "The RIVM-IDS Bioinformatics team"
 version: 1.2.1 #x-release-please-version
-# identifiers:
-#   - type: doi
-#     value: none
-#     description: This is the collection of archived snapshots of all versions of AmpliGone
+doi: 10.5281/zenodo.7684307
+identifiers:
+  - type: doi
+    value: 10.5281/zenodo.7684307
+    description: This is the collection of archived snapshots of all versions of AmpliGone
 repository-code: 'https://github.com/RIVM-bioinformatics/AmpliGone'
 url: >-
   https://rivm-bioinformatics.github.io/AmpliGone/latest/