Skip to content

Commit

Permalink
Merge pull request #292 from althonos/dev
Browse files (browse the repository at this point in the history)
Use a multithreaded pool to run Pyrodigal in parallel
  • Loading branch information
gbouras13 authored Sep 14, 2023
2 parents 30fce5f + fc3ecf0 commit c7137a0
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 19 deletions.
4 changes: 2 additions & 2 deletions bin/pharokka.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -283,12 +283,12 @@ def main():
run_phanotate(input_fasta, out_dir, logdir)
elif gene_predictor == "prodigal":
logger.info("Implementing Prodigal using Pyrodigal.")
run_pyrodigal(input_fasta, out_dir, args.meta, args.coding_table)
run_pyrodigal(input_fasta, out_dir, args.meta, args.coding_table, int(args.threads))
elif gene_predictor == "genbank":
logger.info("Extracting CDS information from your genbank file.")
elif gene_predictor == "prodigal-gv":
logger.info("Implementing Prodigal-gv using Pyrodigal-gv.")
run_pyrodigal_gv(input_fasta, out_dir)
run_pyrodigal_gv(input_fasta, out_dir, int(args.threads))

# translate fastas (parse genbank)
translate_fastas(out_dir, gene_predictor, args.coding_table, args.infile)
Expand Down
45 changes: 28 additions & 17 deletions bin/processes.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import subprocess as sp
import multiprocessing.pool
from datetime import datetime

import pandas as pd
Expand All @@ -14,7 +15,7 @@
from util import remove_directory


def run_pyrodigal_gv(filepath_in, out_dir):
def run_pyrodigal_gv(filepath_in, out_dir, threads):
"""
Gets CDS using pyrodigal_gv
:param filepath_in: input filepath
Expand All @@ -28,12 +29,17 @@ def run_pyrodigal_gv(filepath_in, out_dir):
# true
orf_finder = pyrodigal_gv.ViralGeneFinder(meta=True)

with open(os.path.join(out_dir, "prodigal-gv_out.gff"), "w") as dst:
with open(os.path.join(out_dir, "prodigal-gv_out_tmp.fasta"), "w") as gff:
for i, record in enumerate(SeqIO.parse(filepath_in, "fasta")):
genes = orf_finder.find_genes(str(record.seq))
genes.write_gff(dst, sequence_id=record.id, include_translation_table=True)
genes.write_genes(gff, sequence_id=record.id)
def _find_genes(record):
genes = orf_finder.find_genes(str(record.seq))
return (record.id, genes)

with multiprocessing.pool.ThreadPool(threads) as pool:
with open(os.path.join(out_dir, "prodigal-gv_out.gff"), "w") as dst:
with open(os.path.join(out_dir, "prodigal-gv_out_tmp.fasta"), "w") as gff:
records = SeqIO.parse(filepath_in, "fasta")
for record_id, genes in pool.imap(_find_genes, records):
genes.write_gff(dst, sequence_id=record_id, include_translation_table=True)
genes.write_genes(gff, sequence_id=record_id)

##### phanotate meta mode ########

Expand Down Expand Up @@ -277,14 +283,15 @@ def run_phanotate(filepath_in, out_dir, logdir):
logger.error("Error with Phanotate\n")


def run_pyrodigal(filepath_in, out_dir, meta, coding_table):
def run_pyrodigal(filepath_in, out_dir, meta, coding_table, threads):
"""
Gets CDS using pyrodigal
:param filepath_in: input filepath
:param out_dir: output directory
:param logger logger
:param meta Boolean - metagenomic mode flag
:param coding_table coding table for prodigal (default 11)
:param threads: threads
:return:
"""

Expand All @@ -300,15 +307,19 @@ def run_pyrodigal(filepath_in, out_dir, meta, coding_table):

# coding table possible if false
if prodigal_metamode == False:
trainings_info = orf_finder.train(*seqs, translation_table=int(coding_table))
orf_finder = pyrodigal.GeneFinder(trainings_info, meta=prodigal_metamode)

with open(os.path.join(out_dir, "prodigal_out.gff"), "w") as dst:
with open(os.path.join(out_dir, "prodigal_out_tmp.fasta"), "w") as gff:
for i, record in enumerate(SeqIO.parse(filepath_in, "fasta")):
genes = orf_finder.find_genes(str(record.seq))
genes.write_gff(dst, sequence_id=record.id)
genes.write_genes(gff, sequence_id=record.id)
orf_finder.train(*seqs, translation_table=int(coding_table))

def _find_genes(record):
genes = orf_finder.find_genes(str(record.seq))
return (record.id, genes)

with multiprocessing.pool.ThreadPool(threads) as pool:
with open(os.path.join(out_dir, "prodigal_out.gff"), "w") as dst:
with open(os.path.join(out_dir, "prodigal_out_tmp.fasta"), "w") as gff:
records = SeqIO.parse(filepath_in, "fasta")
for record_id, genes in pool.imap(_find_genes, records):
genes.write_gff(dst, sequence_id=record_id)
genes.write_genes(gff, sequence_id=record_id)


def tidy_phanotate_output(out_dir):
Expand Down

0 comments on commit c7137a0

Please sign in to comment.