Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support for multi-fasta reference input #70

Merged
merged 2 commits into from
Jul 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion AmpliGone/AmpliGone.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,10 @@ def main():
removed_coordinates = tuple(chain(*ProcessedReads["Removed_coordinates"]))

log.info(
f"\tRemoved a total of [bold cyan]{len(removed_coordinates)}[/bold cyan] nucleotides. This is [bold cyan]{round((len(removed_coordinates)/total_nuc_preprocessing)*100, 2)}%[/bold cyan] of the total amount of nucleotides present in the original reads."
f"\tRemoved a total of [bold cyan]{len(removed_coordinates)}[/bold cyan] nucleotides."
)
log.info(
f"\tThis is [bold cyan]{round((len(removed_coordinates)/total_nuc_preprocessing)*100, 2)}%[/bold cyan] of the total amount of nucleotides present in the original reads."
)
log.info(
f"\tThese nucleotides were removed from [bold cyan]{len(set(removed_coordinates))}[/bold cyan] unique nucleotide-coordinates."
Expand Down
97 changes: 52 additions & 45 deletions AmpliGone/fasta2bed.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,71 +72,78 @@
)


def CoordListGen(primerfile, referencefile, err_rate=0.1):
"""Takes a fasta file of primers and a fasta file of a reference sequence, and returns a list of
dictionaries containing the coordinates of the primers in the reference sequence

Parameters
----------
primerfile
The file containing the primers.
referencefile
The reference file that you want to search for the primers in.
err_rate
The maximum number of mismatches allowed between the primer and the reference.

"""
keyl = ("LEFT", "PLUS", "POSITIVE", "FORWARD")
keyr = ("RIGHT", "MINUS", "NEGATIVE", "REVERSE")

primers = list(SeqIO.parse(primerfile, "fasta"))

ref_file = SeqIO.read(referencefile, "fasta")
ref_seq = str(ref_file.seq)

for primer in primers:
seq = str(primer.seq)
revcomp = Seq.reverse_complement(seq)

coords = get_coords(seq, ref_seq, err_rate)
rev_coords = get_coords(revcomp, ref_seq, err_rate)
if coords and rev_coords:
log.warning(
f"Primer [yellow underline]{primer.id}[/yellow underline] found on both forward and reverse strand of '[yellow]{ref_file.name}[/yellow]'.\n[yellow bold]Check to see if this is intended.[/yellow bold]"
)
if not coords and not rev_coords:
log.warning(
f"Skipping [yellow underline]{primer.id}[/yellow underline] as it is found on neither forward or reverse strand of [yellow underline]{ref_file.name}[/yellow underline].\n[yellow bold]Check to see if this is intended.[/yellow bold]"
)
continue
if coords and len(coords) > 1:
log.warning(
f"Primer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]forward strand[/underline]: \n\t[yellow]{coords}\n[bold]Check to see if this is intended.[/yellow][/bold]"
)
if rev_coords and len(rev_coords) > 1:
log.warning(
f"Primer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]reverse strand[/underline]: {rev_coords}.\nCheck to see if this is intended."
)
for start, end in coords.union(rev_coords):
if any(o in primer.id for o in keyl):
strand = "+"
elif any(o in primer.id for o in keyr):
strand = "-"
else:
log.error(
f"Primer name {primer.id} does not contain orientation (e.g. {primer.id}_RIGHT). Consider suffixing with {keyl + keyr}"
ref_file = list(SeqIO.parse(referencefile, "fasta"))
ref_seq = [str(ref.seq) for ref in ref_file]
ref_id = [ref.id for ref in ref_file]

# The loop in a loop here is not a particularly efficient way of doing this.
# But this is the easiest implementation for now, and it's not like this is a particularly
# cpu or time intensive process anyway.
# Might come back to this when there's more time to create a better solution.
for ref_seq, ref_id in zip(ref_seq, ref_id):
log.info(f"Searching for primers in reference-id: [yellow]{ref_id}[/yellow]")
for primer in primers:
seq = str(primer.seq)
revcomp = Seq.reverse_complement(seq)

coords = get_coords(seq, ref_seq, err_rate)
rev_coords = get_coords(revcomp, ref_seq, err_rate)
if coords and rev_coords:
log.warning(
f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on both forward and reverse strand of [yellow]{ref_id}[/yellow].\n\t[yellow bold]Check to see if this is intended.[/yellow bold]"
)
if not coords and not rev_coords:
log.warning(
f"\tSkipping [yellow underline]{primer.id}[/yellow underline] as it is found on neither forward or reverse strand of [yellow underline]{ref_id}[/yellow underline].\n\t[yellow bold]Check to see if this is intended.[/yellow bold]"
)
continue
if coords and len(coords) > 1:
log.warning(
f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]forward strand[/underline] of [yellow underline]{ref_id}[/yellow underline]: \n\t[yellow]{coords}\n\t[bold]Check to see if this is intended.[/yellow][/bold]"
)
if rev_coords and len(rev_coords) > 1:
log.warning(
f"\tPrimer [yellow underline]{primer.id}[/yellow underline] found on multiple locations on [underline]reverse strand[/underline] of [yellow underline]{ref_id}[/yellow underline]: \n\t[yellow]{rev_coords}\n\t[bold]Check to see if this is intended.[/yellow][/bold]"
)
for start, end in coords.union(rev_coords):
if any(o in primer.id for o in keyl):
strand = "+"
elif any(o in primer.id for o in keyr):
strand = "-"
else:
log.error(
f"Primer name {primer.id} does not contain orientation (e.g. {primer.id}_RIGHT). Consider suffixing with {keyl + keyr}"
)
continue
yield dict(
ref=ref_id,
start=start,
end=end,
name=primer.id,
score=".",
strand=strand,
seq=seq,

Check notice on line 144 in AmpliGone/fasta2bed.py

View check run for this annotation

codefactor.io / CodeFactor

AmpliGone/fasta2bed.py#L75-L144

Complex Method
revcomp=revcomp,
)
exit(1)
yield dict(
ref=ref_file.name,
start=start,
end=end,
name=primer.id,
score=".",
strand=strand,
seq=seq,
revcomp=revcomp,
)


def CoordinateListsToBed(df, outfile):
Expand Down