diff --git a/AmpliGone/AmpliGone.py b/AmpliGone/AmpliGone.py index e89cb6f..41e38e0 100644 --- a/AmpliGone/AmpliGone.py +++ b/AmpliGone/AmpliGone.py @@ -347,7 +347,10 @@ def main(): removed_coordinates = tuple(chain(*ProcessedReads["Removed_coordinates"])) log.info( - f"\tRemoved a total of [bold cyan]{len(removed_coordinates)}[/bold cyan] nucleotides. This is [bold cyan]{round((len(removed_coordinates)/total_nuc_preprocessing)*100, 2)}%[/bold cyan] of the total amount of nucleotides present in the original reads." + f"\tRemoved a total of [bold cyan]{len(removed_coordinates)}[/bold cyan] nucleotides." + ) + log.info( + f"\tThis is [bold cyan]{round((len(removed_coordinates)/total_nuc_preprocessing)*100, 2)}%[/bold cyan] of the total amount of nucleotides present in the original reads." ) log.info( f"\tThese nucleotides were removed from [bold cyan]{len(set(removed_coordinates))}[/bold cyan] unique nucleotide-coordinates." diff --git a/AmpliGone/cut_reads.py b/AmpliGone/cut_reads.py index 518a064..58c8e30 100644 --- a/AmpliGone/cut_reads.py +++ b/AmpliGone/cut_reads.py @@ -1,3 +1,5 @@ +from collections import defaultdict + import mappy as mp import pandas as pd @@ -70,16 +72,24 @@ def CutReads( workers, ): Frame, _threadnumber = data - RVSet = set() - FWSet = set() - for _, start, end, strand in primer_df[["start", "end", "strand"]].itertuples(): + + RVDict = defaultdict(set) + FWDict = defaultdict(set) + + reference_ids = set(primer_df["ref"].unique()) + for refid in reference_ids: + RVDict[refid] = set() + FWDict[refid] = set() + + for _, refid, start, end, strand in primer_df[ + ["ref", "start", "end", "strand"] + ].itertuples(): + for coord in range(start + 1, end): # +1 because reference is 1-based if strand == "+": - FWSet.add(coord) + FWDict[refid].add(coord) elif strand == "-": - RVSet.add(coord) - FWList = tuple(FWSet) # Since tuples are hashable - RVList = tuple(RVSet) + RVDict[refid].add(coord) Aln = mp.Aligner( reference, @@ -127,6 +137,14 @@ def CutReads( previous_seq = seq + # Fetch the primer coordinates that correspond to the reference that the read maps to + # we're using tuples here because they are hashable + FWTuple = tuple(FWDict[hit.ctg]) + RVTuple = tuple(RVDict[hit.ctg]) + + if not FWTuple or not RVTuple: + print(FWTuple, RVTuple, hit.ctg) + qstart = hit.q_st qend = hit.q_en @@ -139,7 +157,7 @@ def CutReads( seq, qual, PositionNeedsCutting=PositionInOrBeforePrimer, - primer_list=FWList, + primer_list=FWTuple, position_on_reference=hit.r_st, cut_direction=1, read_direction=hit.strand, @@ -159,7 +177,7 @@ def CutReads( seq, qual, PositionNeedsCutting=PositionInOrAfterPrimer, - primer_list=RVList, + primer_list=RVTuple, position_on_reference=hit.r_en, cut_direction=-1, read_direction=hit.strand, diff --git a/AmpliGone/cutlery.py b/AmpliGone/cutlery.py index bd158df..2e19002 100644 --- a/AmpliGone/cutlery.py +++ b/AmpliGone/cutlery.py @@ -22,7 +22,7 @@ def PositionInOrBeforePrimer(pos, clist, max_lookaround): """ d = lambda x: abs(x - pos) - near = min(clist, key=d) + near = min(clist, key=d, default=0) return abs(pos - near) < max_lookaround and pos <= near @@ -47,7 +47,7 @@ def PositionInOrAfterPrimer(pos, clist, max_lookaround): """ d = lambda x: abs(x - pos) - near = min(clist, key=d) + near = min(clist, key=d, default=0) return abs(pos - near) < max_lookaround and pos >= near diff --git a/AmpliGone/fasta2bed.py b/AmpliGone/fasta2bed.py index b146753..1e5e529 100644 --- a/AmpliGone/fasta2bed.py +++ b/AmpliGone/fasta2bed.py @@ -91,52 +91,59 @@ def CoordListGen(primerfile, referencefile, err_rate=0.1): primers = list(SeqIO.parse(primerfile, "fasta")) - ref_file = SeqIO.read(referencefile, "fasta") - ref_seq = str(ref_file.seq) - - for primer in primers: - seq = str(primer.seq) - revcomp = Seq.reverse_complement(seq) - - coords = get_coords(seq, ref_seq, err_rate) - rev_coords = get_coords(revcomp, ref_seq, err_rate) - if coords and rev_coords: - log.warning( - f"Primer [yellow underline]{primer.id}[/yellow underline] found on both forward and reverse strand of '[yellow]{ref_file.name}[/yellow]'.\n[yellow bold]Check to see if this is intended.[/yellow bold]" - ) - if not coords and not rev_coords: - log.warning( - f"Skipping [yellow underline]{primer.id}[/yellow underline] as it is found on neither forward or reverse strand of [yellow underline]{ref_file.name}[/yellow underline].\n[yellow bold]Check to see if this is intended.[/yellow bold]" - ) - continue - if coords and len(coords) > 1: - log.warning( - f"Primer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]forward strand[/underline]: \n\t[yellow]{coords}\n[bold]Check to see if this is intended.[/yellow][/bold]" - ) - if rev_coords and len(rev_coords) > 1: - log.warning( - f"Primer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]reverse strand[/underline]: {rev_coords}.\nCheck to see if this is intended." - ) - for start, end in coords.union(rev_coords): - if any(o in primer.id for o in keyl): - strand = "+" - elif any(o in primer.id for o in keyr): - strand = "-" - else: - log.error( - f"Primer name {primer.id} does not contain orientation (e.g. {primer.id}_RIGHT). Consider suffixing with {keyl + keyr}" + ref_file = list(SeqIO.parse(referencefile, "fasta")) + ref_seq = [str(ref.seq) for ref in ref_file] + ref_id = [ref.id for ref in ref_file] + + # The loop in a loop here is not a particularly efficient way of doing this. + # But this is the easiest implementation for now, and it's not like this is a particularly + # cpu or time intensive process anyway. + # Might come back to this when there's more time to create a better solution. + for ref_seq, ref_id in zip(ref_seq, ref_id): + log.info(f"Searching for primers in reference-id: [yellow]{ref_id}[/yellow]") + for primer in primers: + seq = str(primer.seq) + revcomp = Seq.reverse_complement(seq) + + coords = get_coords(seq, ref_seq, err_rate) + rev_coords = get_coords(revcomp, ref_seq, err_rate) + if coords and rev_coords: + log.warning( + f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on both [underline]forward[/underline] and [underline]reverse strand[/underline] of [yellow underline]{ref_id}[/yellow underline].\n\t[yellow bold]Check to see if this is intended.[/yellow bold]" + ) + if not coords and not rev_coords: + log.warning( + f"\tSkipping [yellow underline]{primer.id}[/yellow underline] as it is found on neither [underline]forward[/underline] or [underline]reverse strand[/underline] of [yellow underline]{ref_id}[/yellow underline].\n\t[yellow bold]Check to see if this is intended.[/yellow bold]" + ) + continue + if coords and len(coords) > 1: + log.warning( + f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]forward strand[/underline] of [yellow underline]{ref_id}[/yellow underline]: \n\t[yellow]{coords}\n\t[bold]Check to see if this is intended.[/yellow][/bold]" + ) + if rev_coords and len(rev_coords) > 1: + log.warning( + f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]reverse strand[/underline] of [yellow underline]{ref_id}[/yellow underline]: \n\t[yellow]{rev_coords}\n\t[bold]Check to see if this is intended.[/yellow][/bold]" + ) + for start, end in coords.union(rev_coords): + if any(o in primer.id for o in keyl): + strand = "+" + elif any(o in primer.id for o in keyr): + strand = "-" + else: + log.error( + f"Primer name {primer.id} does not contain orientation (e.g. {primer.id}_RIGHT). Consider suffixing with {keyl + keyr}" + ) + continue + yield dict( + ref=ref_id, + start=start, + end=end, + name=primer.id, + score=".", + strand=strand, + seq=seq, + revcomp=revcomp, ) - exit(1) - yield dict( - ref=ref_file.name, - start=start, - end=end, - name=primer.id, - score=".", - strand=strand, - seq=seq, - revcomp=revcomp, - ) def CoordinateListsToBed(df, outfile): diff --git a/CITATION.cff b/CITATION.cff index e31bd8a..3fc05af 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -13,13 +13,14 @@ authors: affiliation: >- National Institute for Public Health and the Environment (RIVM) - orcid: https://orcid.org/0009-0002-6384-3446 + orcid: 'https://orcid.org/0009-0002-6384-3446' - name: "The RIVM-IDS Bioinformatics team" version: 1.2.1 #x-release-please-version -# identifiers: -# - type: doi -# value: none -# description: This is the collection of archived snapshots of all versions of AmpliGone +doi: 10.5281/zenodo.7684307 +identifiers: + - type: doi + value: 10.5281/zenodo.7684307 + description: This is the collection of archived snapshots of all versions of AmpliGone repository-code: 'https://github.com/RIVM-bioinformatics/AmpliGone' url: >- https://rivm-bioinformatics.github.io/AmpliGone/latest/