diff --git a/AmpliGone/AmpliGone.py b/AmpliGone/AmpliGone.py index e89cb6f..41e38e0 100644 --- a/AmpliGone/AmpliGone.py +++ b/AmpliGone/AmpliGone.py @@ -347,7 +347,10 @@ def main(): removed_coordinates = tuple(chain(*ProcessedReads["Removed_coordinates"])) log.info( - f"\tRemoved a total of [bold cyan]{len(removed_coordinates)}[/bold cyan] nucleotides. This is [bold cyan]{round((len(removed_coordinates)/total_nuc_preprocessing)*100, 2)}%[/bold cyan] of the total amount of nucleotides present in the original reads." + f"\tRemoved a total of [bold cyan]{len(removed_coordinates)}[/bold cyan] nucleotides." + ) + log.info( + f"\tThis is [bold cyan]{round((len(removed_coordinates)/total_nuc_preprocessing)*100, 2)}%[/bold cyan] of the total amount of nucleotides present in the original reads." ) log.info( f"\tThese nucleotides were removed from [bold cyan]{len(set(removed_coordinates))}[/bold cyan] unique nucleotide-coordinates." diff --git a/AmpliGone/fasta2bed.py b/AmpliGone/fasta2bed.py index b146753..e27dbc3 100644 --- a/AmpliGone/fasta2bed.py +++ b/AmpliGone/fasta2bed.py @@ -91,52 +91,59 @@ def CoordListGen(primerfile, referencefile, err_rate=0.1): primers = list(SeqIO.parse(primerfile, "fasta")) - ref_file = SeqIO.read(referencefile, "fasta") - ref_seq = str(ref_file.seq) - - for primer in primers: - seq = str(primer.seq) - revcomp = Seq.reverse_complement(seq) - - coords = get_coords(seq, ref_seq, err_rate) - rev_coords = get_coords(revcomp, ref_seq, err_rate) - if coords and rev_coords: - log.warning( - f"Primer [yellow underline]{primer.id}[/yellow underline] found on both forward and reverse strand of '[yellow]{ref_file.name}[/yellow]'.\n[yellow bold]Check to see if this is intended.[/yellow bold]" - ) - if not coords and not rev_coords: - log.warning( - f"Skipping [yellow underline]{primer.id}[/yellow underline] as it is found on neither forward or reverse strand of [yellow underline]{ref_file.name}[/yellow underline].\n[yellow bold]Check to see if this is intended.[/yellow bold]" - ) - continue - if coords and len(coords) > 1: - log.warning( - f"Primer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]forward strand[/underline]: \n\t[yellow]{coords}\n[bold]Check to see if this is intended.[/yellow][/bold]" - ) - if rev_coords and len(rev_coords) > 1: - log.warning( - f"Primer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]reverse strand[/underline]: {rev_coords}.\nCheck to see if this is intended." - ) - for start, end in coords.union(rev_coords): - if any(o in primer.id for o in keyl): - strand = "+" - elif any(o in primer.id for o in keyr): - strand = "-" - else: - log.error( - f"Primer name {primer.id} does not contain orientation (e.g. {primer.id}_RIGHT). Consider suffixing with {keyl + keyr}" + ref_file = list(SeqIO.parse(referencefile, "fasta")) + ref_seq = [str(ref.seq) for ref in ref_file] + ref_id = [ref.id for ref in ref_file] + + # The loop in a loop here is not a particularly efficient way of doing this. + # But this is the easiest implementation for now, and it's not like this is a particularly + # cpu or time intensive process anyway. + # Might come back to this when there's more time to create a better solution. + for ref_seq, ref_id in zip(ref_seq, ref_id): + log.info(f"Searching for primers in reference-id: [yellow]{ref_id}[/yellow]") + for primer in primers: + seq = str(primer.seq) + revcomp = Seq.reverse_complement(seq) + + coords = get_coords(seq, ref_seq, err_rate) + rev_coords = get_coords(revcomp, ref_seq, err_rate) + if coords and rev_coords: + log.warning( + f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on both forward and reverse strand of [yellow]{ref_id}[/yellow].\n\t[yellow bold]Check to see if this is intended.[/yellow bold]" + ) + if not coords and not rev_coords: + log.warning( + f"\tSkipping [yellow underline]{primer.id}[/yellow underline] as it is found on neither forward or reverse strand of [yellow underline]{ref_id}[/yellow underline].\n\t[yellow bold]Check to see if this is intended.[/yellow bold]" + ) + continue + if coords and len(coords) > 1: + log.warning( + f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]forward strand[/underline] of [yellow underline]{ref_id}[/yellow underline]: \n\t[yellow]{coords}\n\t[bold]Check to see if this is intended.[/yellow][/bold]" + ) + if rev_coords and len(rev_coords) > 1: + log.warning( + f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]reverse strand[/underline] of [yellow underline]{ref_id}[/yellow underline]: \n\t[yellow]{rev_coords}\n\t[bold]Check to see if this is intended.[/yellow][/bold]" + ) + for start, end in coords.union(rev_coords): + if any(o in primer.id for o in keyl): + strand = "+" + elif any(o in primer.id for o in keyr): + strand = "-" + else: + log.error( + f"Primer name {primer.id} does not contain orientation (e.g. {primer.id}_RIGHT). Consider suffixing with {keyl + keyr}" + ) + continue + yield dict( + ref=ref_id, + start=start, + end=end, + name=primer.id, + score=".", + strand=strand, + seq=seq, + revcomp=revcomp, ) - exit(1) - yield dict( - ref=ref_file.name, - start=start, - end=end, - name=primer.id, - score=".", - strand=strand, - seq=seq, - revcomp=revcomp, - ) def CoordinateListsToBed(df, outfile):