RIVM-bioinformatics · florianzwagemaker · Jul 27, 2023 · Jul 14, 2023 · Jul 27, 2023
diff --git a/AmpliGone/AmpliGone.py b/AmpliGone/AmpliGone.py
@@ -347,7 +347,10 @@ def main():
     removed_coordinates = tuple(chain(*ProcessedReads["Removed_coordinates"]))
 
     log.info(
-        f"\tRemoved a total of [bold cyan]{len(removed_coordinates)}[/bold cyan] nucleotides. This is [bold cyan]{round((len(removed_coordinates)/total_nuc_preprocessing)*100, 2)}%[/bold cyan] of the total amount of nucleotides present in the original reads."
+        f"\tRemoved a total of [bold cyan]{len(removed_coordinates)}[/bold cyan] nucleotides."
+    )
+    log.info(
+        f"\tThis is [bold cyan]{round((len(removed_coordinates)/total_nuc_preprocessing)*100, 2)}%[/bold cyan] of the total amount of nucleotides present in the original reads."
     )
     log.info(
         f"\tThese nucleotides were removed from [bold cyan]{len(set(removed_coordinates))}[/bold cyan] unique nucleotide-coordinates."

diff --git a/AmpliGone/fasta2bed.py b/AmpliGone/fasta2bed.py
@@ -72,71 +72,78 @@
    )


 def CoordListGen(primerfile, referencefile, err_rate=0.1):
    """Takes a fasta file of primers and a fasta file of a reference sequence, and returns a list of
    dictionaries containing the coordinates of the primers in the reference sequence

    Parameters
    ----------
    primerfile
        The file containing the primers.
    referencefile
        The reference file that you want to search for the primers in.
    err_rate
        The maximum number of mismatches allowed between the primer and the reference.

    """
    keyl = ("LEFT", "PLUS", "POSITIVE", "FORWARD")
    keyr = ("RIGHT", "MINUS", "NEGATIVE", "REVERSE")
 
     primers = list(SeqIO.parse(primerfile, "fasta"))
 
-    ref_file = SeqIO.read(referencefile, "fasta")
-    ref_seq = str(ref_file.seq)
-
-    for primer in primers:
-        seq = str(primer.seq)
-        revcomp = Seq.reverse_complement(seq)
-
-        coords = get_coords(seq, ref_seq, err_rate)
-        rev_coords = get_coords(revcomp, ref_seq, err_rate)
-        if coords and rev_coords:
-            log.warning(
-                f"Primer [yellow underline]{primer.id}[/yellow underline] found on both forward and reverse strand of '[yellow]{ref_file.name}[/yellow]'.\n[yellow bold]Check to see if this is intended.[/yellow bold]"
-            )
-        if not coords and not rev_coords:
-            log.warning(
-                f"Skipping [yellow underline]{primer.id}[/yellow underline] as it is found on neither forward or reverse strand of [yellow underline]{ref_file.name}[/yellow underline].\n[yellow bold]Check to see if this is intended.[/yellow bold]"
-            )
-            continue
-        if coords and len(coords) > 1:
-            log.warning(
-                f"Primer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]forward strand[/underline]: \n\t[yellow]{coords}\n[bold]Check to see if this is intended.[/yellow][/bold]"
-            )
-        if rev_coords and len(rev_coords) > 1:
-            log.warning(
-                f"Primer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]reverse strand[/underline]: {rev_coords}.\nCheck to see if this is intended."
-            )
-        for start, end in coords.union(rev_coords):
-            if any(o in primer.id for o in keyl):
-                strand = "+"
-            elif any(o in primer.id for o in keyr):
-                strand = "-"
-            else:
-                log.error(
-                    f"Primer name {primer.id} does not contain orientation (e.g. {primer.id}_RIGHT). Consider suffixing with {keyl + keyr}"
+    ref_file = list(SeqIO.parse(referencefile, "fasta"))
+    ref_seq = [str(ref.seq) for ref in ref_file]
+    ref_id = [ref.id for ref in ref_file]
+
+    # The loop in a loop here is not a particularly efficient way of doing this.
+    # But this is the easiest implementation for now, and it's not like this is a particularly
+    # cpu or time intensive process anyway.
+    # Might come back to this when there's more time to create a better solution.
+    for ref_seq, ref_id in zip(ref_seq, ref_id):
+        log.info(f"Searching for primers in reference-id: [yellow]{ref_id}[/yellow]")
+        for primer in primers:
+            seq = str(primer.seq)
+            revcomp = Seq.reverse_complement(seq)
+
+            coords = get_coords(seq, ref_seq, err_rate)
+            rev_coords = get_coords(revcomp, ref_seq, err_rate)
+            if coords and rev_coords:
+                log.warning(
+                    f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on both forward and reverse strand of [yellow]{ref_id}[/yellow].\n\t[yellow bold]Check to see if this is intended.[/yellow bold]"
+                )
+            if not coords and not rev_coords:
+                log.warning(
+                    f"\tSkipping [yellow underline]{primer.id}[/yellow underline] as it is found on neither forward or reverse strand of [yellow underline]{ref_id}[/yellow underline].\n\t[yellow bold]Check to see if this is intended.[/yellow bold]"
+                )
+                continue
+            if coords and len(coords) > 1:
+                log.warning(
+                    f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]forward strand[/underline] of [yellow underline]{ref_id}[/yellow underline]: \n\t[yellow]{coords}\n\t[bold]Check to see if this is intended.[/yellow][/bold]"
+                )
+            if rev_coords and len(rev_coords) > 1:
+                log.warning(
+                    f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]reverse strand[/underline] of [yellow underline]{ref_id}[/yellow underline]: \n\t[yellow]{rev_coords}\n\t[bold]Check to see if this is intended.[/yellow][/bold]"
+                )
+            for start, end in coords.union(rev_coords):
+                if any(o in primer.id for o in keyl):
+                    strand = "+"
+                elif any(o in primer.id for o in keyr):
+                    strand = "-"
+                else:
+                    log.error(
+                        f"Primer name {primer.id} does not contain orientation (e.g. {primer.id}_RIGHT). Consider suffixing with {keyl + keyr}"
+                    )
+                    continue
+                yield dict(
+                    ref=ref_id,
+                    start=start,
+                    end=end,
+                    name=primer.id,
+                    score=".",
+                    strand=strand,
+                    seq=seq,
+                    revcomp=revcomp,
                 )
-                exit(1)
-            yield dict(
-                ref=ref_file.name,
-                start=start,
-                end=end,
-                name=primer.id,
-                score=".",
-                strand=strand,
-                seq=seq,
-                revcomp=revcomp,
-            )
 
 
 def CoordinateListsToBed(df, outfile):