Skip to content

Commit

Permalink
added cpu support for renaming fasta entries
Browse files Browse the repository at this point in the history
  • Loading branch information
JLSteenwyk committed Sep 18, 2024
1 parent ab8c3f1 commit a4a2e91
Show file tree
Hide file tree
Showing 12 changed files with 182 additions and 77 deletions.
6 changes: 4 additions & 2 deletions docs/usage/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,8 @@ doi: 10.1093/gbe/evw179 and Steenwyk et al., PLOS Biology
phykit parsimony_informative_sites <alignment>
Options: |br|
*<alignment>*: first argument after function name should be an alignment file
*<alignment>*: first argument after function name should be an alignment file |br|
*\\-\\-cpu*: CPUs to use to accelerate calculation

|
Expand Down Expand Up @@ -549,7 +550,8 @@ and Evolution (2003), doi: 10.1016/S1055-7903(03)00057-5.
phykit relative_composition_variability <alignment>
Options: |br|
*<alignment>*: first argument after function name should be an alignment file
*<alignment>*: first argument after function name should be an alignment file |br|
*\\-\\-cpu*: CPUs to use to accelerate calculation

|
Expand Down
40 changes: 28 additions & 12 deletions phykit/phykit.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,7 @@ def alignment_length_no_gaps(argv):
),
)
parser.add_argument("alignment", type=str, help=SUPPRESS)
parser.add_argument("--cpu", type=str, help=SUPPRESS)
parser.add_argument("--cpu", type=int, help=SUPPRESS)
args = parser.parse_args(argv)
AlignmentLengthNoGaps(args).run()

Expand Down Expand Up @@ -606,7 +606,7 @@ def alignment_recoding(argv):

parser.add_argument("alignment", type=str, help=SUPPRESS)
parser.add_argument("-c", "--code", type=str, help=SUPPRESS)
parser.add_argument("--cpu", type=str, help=SUPPRESS)
parser.add_argument("--cpu", type=int, help=SUPPRESS)
args = parser.parse_args(argv)
AlignmentRecoding(args).run()

Expand Down Expand Up @@ -883,7 +883,7 @@ def pairwise_identity(argv):
parser.add_argument(
"-e", "--exclude_gaps", action="store_true", required=False, help=SUPPRESS
)
parser.add_argument("--cpu", type=str, help=SUPPRESS)
parser.add_argument("--cpu", type=int, help=SUPPRESS)
args = parser.parse_args(argv)
PairwiseIdentity(args).run()

Expand Down Expand Up @@ -920,17 +920,21 @@ def parsimony_informative_sites(argv):
pk_parsimony_informative_sites, pk_pis
Usage:
phykit parsimony_informative_sites <alignment>
phykit parsimony_informative_sites <alignment> [--cpu <cpu>]
Options
=====================================================
<alignment> first argument after
function name should be
an alignment file
an alignment file
--cpu CPUs to use to
accelerate calculation
"""
),
)
parser.add_argument("alignment", type=str, help=SUPPRESS)
parser.add_argument("--cpu", type=int, help=SUPPRESS)
args = parser.parse_args(argv)
ParsimonyInformative(args).run()

Expand All @@ -953,23 +957,27 @@ def rcv(argv):
RCV is calculated following Phillips and Penny, Molecular Phylogenetics
and Evolution (2003), doi: 10.1016/S1055-7903(03)00057-5.
Aliases:
Aliases:
relative_composition_variability, rel_comp_var, rcv
Command line interfaces:
pk_relative_composition_variability, pk_rel_comp_var, pk_rcv
Usage:
phykit relative_composition_variability <alignment>
phykit relative_composition_variability <alignment> [--cpu <cpu>]
Options
=====================================================
<alignment> first argument after
<alignment> first argument after
function name should be
an alignment file
an alignment file
--cpu CPUs to use to
accelerate calculation
"""
),
)
parser.add_argument("alignment", type=str, help=SUPPRESS)
parser.add_argument("--cpu", type=int, help=SUPPRESS)
args = parser.parse_args(argv)
RelativeCompositionVariability(args).run()

Expand All @@ -996,17 +1004,21 @@ def rcvt(argv):
pk_relative_composition_variability_taxon, pk_rel_comp_var_taxon, pk_rcvt
Usage:
phykit relative_composition_variability_taxon <alignment>
phykit relative_composition_variability_taxon <alignment> [--cpu <cpu>]
Options
=====================================================
<alignment> first argument after
function name should be
an alignment file
an alignment file
--cpu CPUs to use to
accelerate calculation
"""
),
)
parser.add_argument("alignment", type=str, help=SUPPRESS)
parser.add_argument("--cpu", type=int, help=SUPPRESS)
args = parser.parse_args(argv)
RelativeCompositionVariabilityTaxon(args).run()

Expand Down Expand Up @@ -1034,7 +1046,7 @@ def rename_fasta_entries(argv):
Usage:
phykit rename_fasta_entries <fasta> -i/--idmap <idmap>
[-o/--output <output_file>]
[-o/--output <output_file> --cpu <cpu>]
Options
=====================================================
Expand All @@ -1052,12 +1064,16 @@ def rename_fasta_entries(argv):
name as the input file with
the suffix ".renamed.fa" added
to it.
--cpu CPUs to use to
accelerate calculation
"""
),
)
parser.add_argument("fasta", type=str, help=SUPPRESS)
parser.add_argument("-i", "--idmap", type=str, help=SUPPRESS)
parser.add_argument("-o", "--output", type=str, required=False, help=SUPPRESS)
parser.add_argument("--cpu", type=int, help=SUPPRESS)
args = parser.parse_args(argv)
RenameFastaEntries(args).run()

Expand Down
47 changes: 29 additions & 18 deletions phykit/services/alignment/base.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from collections import Counter
import multiprocessing
from multiprocessing import cpu_count
from multiprocessing import Pool
import sys
from typing import Dict, List

from typing import List
from Bio.Seq import Seq

from ..base import BaseService
from ...helpers.files import (
Expand Down Expand Up @@ -45,9 +47,6 @@ def __init__(
self.exclude_gaps = exclude_gaps

def get_alignment_and_format(self):
"""
automatic file type determination
"""
try:
return get_alignment_and_format_helper(self.alignment_file_path)
except FileNotFoundError:
Expand All @@ -58,43 +57,55 @@ def get_alignment_and_format(self):
def calculate_rcv(self) -> float:
alignment, _, _ = self.get_alignment_and_format()
aln_len = alignment.get_alignment_length()

concat_seq = []
num_records = len(alignment)

concat_seq = "".join(str(record.seq) for record in alignment)

total_counts = Counter(concat_seq)

average_d = {
seq: total_counts[seq] / num_records for seq in total_counts
}

indiv_rcv_values = []
cpu = self.set_cpu()

for record in alignment:
record_counts = Counter(record.seq)
temp_rcv = sum(
abs(
record_counts[seq_letter] - average_d[seq_letter]
) for seq_letter in total_counts
with Pool(cpu) as pool:
indiv_rcv_values = pool.starmap(
self.calculate_indiv_rcv,
[
(record.seq, average_d, total_counts, aln_len, num_records)
for record in alignment
]
)
indiv_rcv_values.append(temp_rcv / (num_records * aln_len))

relative_composition_variability = sum(indiv_rcv_values)

return relative_composition_variability

def calculate_indiv_rcv(
self,
seq: Seq,
average_d: Dict[str, float],
total_counts: Counter,
aln_len: int,
num_records: int,
) -> float:
record_counts = Counter(seq)
temp_rcv = sum(
abs(record_counts[seq_letter] - average_d[seq_letter])
for seq_letter in total_counts
)
return temp_rcv / (num_records * aln_len)

def get_gap_chars(is_protein: bool) -> List[str]:
if is_protein:
return ["-", "?", "*", "X", "x"]
else:
return ["-", "?", "*", "X", "x", "N", "n"]

def set_cpu(self) -> int:
cpu_available = multiprocessing.cpu_count()
cpu_available = cpu_count()

cpu = int(self.cpu[0]) if self.cpu[0] else multiprocessing.cpu_count()
cpu = int(self.cpu[0]) if self.cpu[0] else cpu_count()

if cpu > cpu_available:
return cpu_available
Expand Down
8 changes: 2 additions & 6 deletions phykit/services/alignment/compositional_bias_per_site.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,28 +45,24 @@ def calculate_compositional_bias_per_site(
List[Power_divergenceResult],
List[Union[float, str]],
]:
# get aln length
aln_len = alignment.get_alignment_length()

stat_res = []
p_vals = []
nan_idx = []

for idx in range(aln_len):
# Count occurrences of each character at site idx
num_occurrences = self.get_number_of_occurrences_per_character(alignment, idx)
num_occurrences = \
self.get_number_of_occurrences_per_character(alignment, idx)

# Perform chi-square test on the counts
chisquare_res = chisquare(num_occurrences)
stat_res.append(chisquare_res)

# Collect p-values and track NaN values
if str(chisquare_res.pvalue) != "nan":
p_vals.append(chisquare_res.pvalue)
else:
nan_idx.append(idx)

# Correct p-values using false discovery control
p_vals_corrected = list(false_discovery_control(p_vals))

# Reinsert "nan" values at the appropriate positions
Expand Down
1 change: 1 addition & 0 deletions phykit/services/alignment/create_concatenation_matrix.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sys
from textwrap import dedent
from typing import Dict, List, Tuple

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

Expand Down
2 changes: 2 additions & 0 deletions phykit/services/alignment/dna_threader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import sys
from typing import Dict, List

from Bio import SeqIO
from Bio.Seq import Seq

from .base import Alignment


Expand Down
1 change: 0 additions & 1 deletion phykit/services/alignment/evolutionary_rate_per_site.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def calculate_evolutionary_rate_per_site(

gap_chars = self.get_gap_chars()

# calculate PIC for each site in the alignment
for idx in range(aln_len):
num_occurrences = self.get_number_of_occurrences_per_character(
alignment, idx, gap_chars
Expand Down
31 changes: 22 additions & 9 deletions phykit/services/alignment/parsimony_informative_sites.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import Counter
from multiprocessing import Pool
from typing import Dict, Tuple

from Bio.Align import MultipleSeqAlignment
Expand All @@ -12,9 +13,8 @@ def __init__(self, args) -> None:

def run(self):
alignment, _, _ = self.get_alignment_and_format()
pi_sites, aln_len, pi_sites_per = self.calculate_parsimony_informative_sites(
alignment
)
pi_sites, aln_len, pi_sites_per = \
self.calculate_parsimony_informative_sites(alignment)

print(f"{pi_sites}\t{aln_len}\t{round(pi_sites_per, 4)}")

Expand All @@ -36,21 +36,34 @@ def is_parsimony_informative(
self,
num_occurrences: Counter,
) -> bool:
informative_char_count = sum(1 for count in num_occurrences.values() if count >= 2)
informative_char_count = sum(
1 for count in num_occurrences.values() if count >= 2
)
return informative_char_count >= 2

def calculate_parsimony_informative_sites(
self,
alignment: MultipleSeqAlignment,
) -> Tuple[int, int, float]:
cpu = self.set_cpu()

aln_len = alignment.get_alignment_length()
pi_sites = 0

for idx in range(aln_len):
num_occurrences = self.get_number_of_occurrences_per_character(alignment, idx)
if self.is_parsimony_informative(num_occurrences):
pi_sites += 1
with Pool(cpu) as pool:
results = pool.map(
self.check_site_parsimony_informative,
[(alignment, idx) for idx in range(aln_len)]
)

pi_sites = sum(results)
pi_sites_per = (pi_sites / aln_len) * 100

return pi_sites, aln_len, pi_sites_per

def check_site_parsimony_informative(
self,
args: Tuple[MultipleSeqAlignment, int]
) -> int:
alignment, idx = args
num_occurrences = self.get_number_of_occurrences_per_character(alignment, idx)
return 1 if self.is_parsimony_informative(num_occurrences) else 0
2 changes: 1 addition & 1 deletion phykit/services/alignment/rcv.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ def run(self):
print(round(relative_composition_variability, 4))

def process_args(self, args):
return dict(alignment_file_path=args.alignment)
return dict(alignment_file_path=args.alignment, cpu=args.cpu)
Loading

0 comments on commit a4a2e91

Please sign in to comment.