Skip to content

Commit

Permalink
added CPU support to variable sites function
Browse files Browse the repository at this point in the history
  • Loading branch information
JLSteenwyk committed Sep 19, 2024
1 parent e441d34 commit f13447b
Show file tree
Hide file tree
Showing 10 changed files with 87 additions and 72 deletions.
11 changes: 6 additions & 5 deletions docs/usage/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -618,13 +618,13 @@ Acids Research (1999), doi: 10.1093/nar/27.13.2682.

.. code-block:: shell
phykit sum_of_pairs_score <alignment> --reference <reference_alignment>
phykit sum_of_pairs_score <alignment> --reference <reference_alignment> [--cpu <cpu>]
Options: |br|
*<alignment>*: first argument after function name should be a query
fasta alignment file to be scored for accuracy |br|
*-r/\\-\\-reference*: reference alignment to compare the query alignment
to
*-r/\\-\\-reference*: reference alignment to compare the query alignment to |br|
*\\-\\-cpu*: CPUs to use to accelerate calculation

|
Expand All @@ -649,10 +649,11 @@ doi: 10.1093/gbe/evw179.

.. code-block:: shell
phykit variable_sites <alignment>
phykit variable_sites <alignment> [--cpu <cpu>]
Options: |br|
*<alignment>*: first argument after function name should be an alignment file
*<alignment>*: first argument after function name should be an alignment file |br|
*\\-\\-cpu*: CPUs to use to accelerate calculation

|
Expand Down
12 changes: 10 additions & 2 deletions phykit/phykit.py
Original file line number Diff line number Diff line change
Expand Up @@ -1100,7 +1100,7 @@ def sum_of_pairs_score(argv):
pk_sum_of_pairs_score, pk_sops, pk_sop
Usage:
phykit sum_of_pairs_score <fasta> -r/--reference <ref.aln>
phykit sum_of_pairs_score <fasta> -r/--reference <ref.aln> [--cpu <cpu>]
Options
=====================================================
Expand All @@ -1111,11 +1111,15 @@ def sum_of_pairs_score(argv):
-r/--reference reference fasta alignment to
compare query alignment to
--cpu CPUs to use to
accelerate calculation
"""
),
)
parser.add_argument("fasta", type=str, help=SUPPRESS)
parser.add_argument("-r", "--reference", type=str, help=SUPPRESS)
parser.add_argument("--cpu", type=int, help=SUPPRESS)
args = parser.parse_args(argv)
SumOfPairsScore(args).run()

Expand Down Expand Up @@ -1150,17 +1154,21 @@ def variable_sites(argv):
pk_variable_sites, pk_vs
Usage:
phykit variable_sites <alignment>
phykit variable_sites <alignment> [--cpu <cpu>]
Options
=====================================================
<alignment> first argument after
function name should be
an alignment file
--cpu CPUs to use to
accelerate calculation
"""
),
)
parser.add_argument("alignment", type=str, help=SUPPRESS)
parser.add_argument("--cpu", type=int, help=SUPPRESS)
args = parser.parse_args(argv)
VariableSites(args).run()

Expand Down
2 changes: 1 addition & 1 deletion phykit/services/alignment/parsimony_informative_sites.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def run(self):
print(f"{pi_sites}\t{aln_len}\t{round(pi_sites_per, 4)}")

def process_args(self, args) -> Dict[str, str]:
    """
    Build the keyword-argument dict from the parsed arguments.

    Reads ``args.alignment`` and ``args.cpu`` and returns them under the
    keys ``alignment_file_path`` and ``cpu``.
    """
    # Only the post-change return is kept; the earlier return without
    # ``cpu`` made this one unreachable.
    return dict(alignment_file_path=args.alignment, cpu=args.cpu)

def get_number_of_occurrences_per_character(
self,
Expand Down
30 changes: 0 additions & 30 deletions phykit/services/alignment/rcvt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,36 +5,6 @@
from .base import Alignment


# class RelativeCompositionVariabilityTaxon(Alignment):
# def __init__(self, args) -> None:
# super().__init__(**self.process_args(args))

# def run(self):
# alignment, _, _ = self.get_alignment_and_format()
# aln_len = alignment.get_alignment_length()
# num_records = len(alignment)

# concat_seq = "".join(str(record.seq) for record in alignment)
# total_counts = Counter(concat_seq)

# average_d = {
# char: total_counts[char] / num_records for char in total_counts
# }

# for record in alignment:
# record_counts = Counter(record.seq)
# temp_rcv = \
# sum(
# abs(
# record_counts[seq_letter] - average_d[seq_letter]
# ) for seq_letter in total_counts
# )
# rcv_value = temp_rcv / (num_records * aln_len)
# print(f"{record.id}\t{round(rcv_value, 4)}")

# def process_args(self, args):
# return dict(alignment_file_path=args.alignment)

class RelativeCompositionVariabilityTaxon(Alignment):
def __init__(self, args) -> None:
super().__init__(**self.process_args(args))
Expand Down
1 change: 1 addition & 0 deletions phykit/services/alignment/rename_fasta_entries.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def process_args(self, args) -> Dict[str, str]:
fasta=args.fasta,
idmap=args.idmap,
output_file_path=output_file_path,
cpu=args.cpu
)

def load_idmap(self, idmap_file: str) -> Dict[str, str]:
Expand Down
58 changes: 43 additions & 15 deletions phykit/services/alignment/sum_of_pairs_score.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import itertools
from multiprocessing import Pool
from typing import Dict, List, Tuple

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

from .base import Alignment


Expand All @@ -27,29 +27,57 @@ def run(self):
print(round(number_of_matches / number_of_total_pairs, 4))

def process_args(self, args) -> Dict[str, str]:
    """
    Build the keyword-argument dict from the parsed arguments.

    Returns the query alignment path (``fasta``), the reference alignment
    path (``reference``) and the requested CPU count (``cpu``).
    """
    # Only the post-change return is kept; the earlier return without
    # ``cpu`` made this one unreachable.
    return dict(fasta=args.fasta, reference=args.reference, cpu=args.cpu)

def determine_number_of_matches_and_total_pairs(
    self,
    record_id_pairs: List[Tuple[str, str]],
    reference_records: Dict[str, "SeqRecord"],
    query_records: Dict[str, "SeqRecord"],
) -> Tuple[int, int]:
    """
    Sum matches and compared positions over every pair of record ids.

    Each pair is scored by ``self.compare_pair`` in a worker pool; the
    per-pair ``(matches, total)`` tuples are then added up.

    Returns a tuple ``(number_of_matches, number_of_total_pairs)``.
    """
    # NOTE: no debug output here -- stdout carries the final score.
    # Worker count comes from self.set_cpu() (defined outside this block;
    # presumably derived from --cpu).
    cpu = self.set_cpu()

    # One argument tuple per pair; starmap unpacks it into compare_pair.
    tasks = [
        (first_in_pair, second_in_pair, reference_records, query_records)
        for first_in_pair, second_in_pair in record_id_pairs
    ]
    with Pool(cpu) as pool:
        results = pool.starmap(self.compare_pair, tasks)

    number_of_matches = sum(result[0] for result in results)
    number_of_total_pairs = sum(result[1] for result in results)

    return number_of_matches, number_of_total_pairs

def compare_pair(
    self,
    first_in_pair: str,
    second_in_pair: str,
    reference_records: Dict[str, "SeqRecord"],
    query_records: Dict[str, "SeqRecord"],
) -> Tuple[int, int]:
    """
    Compare a pair of sequences and return the number of matches and total pairs.

    The two ids are looked up in both the reference and the query records.
    The four sequences are walked in lockstep; a position counts as a match
    when both query residues equal their reference residues.

    Returns a tuple ``(number_of_matches, number_of_total_pairs)``.
    """
    ref_seq1 = reference_records[first_in_pair].seq
    ref_seq2 = reference_records[second_in_pair].seq
    query_seq1 = query_records[first_in_pair].seq
    query_seq2 = query_records[second_in_pair].seq

    number_of_matches = 0
    number_of_total_pairs = 0

    # zip stops at the shortest of the four sequences, so only positions
    # present in all of them are counted.
    for ref_res1, ref_res2, query_res1, query_res2 in zip(
        ref_seq1, ref_seq2, query_seq1, query_seq2
    ):
        number_of_total_pairs += 1
        if ref_res1 == query_res1 and ref_res2 == query_res2:
            number_of_matches += 1

    return number_of_matches, number_of_total_pairs
33 changes: 20 additions & 13 deletions phykit/services/alignment/variable_sites.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from multiprocessing import Pool
from typing import Dict, Tuple

from Bio.Align import MultipleSeqAlignment
Expand All @@ -10,35 +11,41 @@ def __init__(self, args) -> None:
super().__init__(**self.process_args(args))

def run(self):
    """
    Load the alignment, count its variable sites and print the result.

    Prints one tab-separated line: number of variable sites, alignment
    length, and the third value returned by ``calculate_variable_sites``
    rounded to 4 decimals.
    """
    # The alignment is loaded once; the format and protein flag returned
    # alongside it are not needed here.
    alignment, _, _ = self.get_alignment_and_format()
    var_sites, aln_len, var_sites_per = self.calculate_variable_sites(alignment)

    print(f"{var_sites}\t{aln_len}\t{round(var_sites_per, 4)}")

def process_args(self, args) -> Dict[str, str]:
    """
    Build the keyword-argument dict from the parsed arguments.

    Returns the alignment path (``alignment_file_path``) and the requested
    CPU count (``cpu``).
    """
    # Only the post-change return is kept; the earlier return without
    # ``cpu`` made this one unreachable.
    return dict(alignment_file_path=args.alignment, cpu=args.cpu)

def calculate_variable_sites(
    self,
    alignment: "MultipleSeqAlignment"
) -> Tuple[int, int, float]:
    """
    Count the variable columns of an alignment.

    Every column is handed to ``self.check_site_variability`` through a
    worker pool; the 0/1 results are summed.

    Returns a tuple ``(var_sites, aln_len, var_sites_per)`` where
    ``var_sites_per`` is ``var_sites / aln_len * 100``. A zero-length
    alignment yields ``(0, 0, 0.0)``.
    """
    aln_len = alignment.get_alignment_length()

    # Nothing to scan, and the percentage below would divide by zero.
    if aln_len == 0:
        return 0, 0, 0.0

    gap_chars = self.get_gap_chars()
    cpu = self.set_cpu()

    # pool.map passes a single item per call, so each column is bundled
    # with the gap characters in one tuple.
    columns = [(alignment[:, i], gap_chars) for i in range(aln_len)]
    with Pool(cpu) as pool:
        results = pool.map(self.check_site_variability, columns)

    var_sites = sum(results)
    var_sites_per = (var_sites / aln_len) * 100

    return var_sites, aln_len, var_sites_per

def check_site_variability(self, args: Tuple[str, set]) -> int:
    """
    Return 1 if an alignment column is variable, otherwise 0.

    ``args`` is a single ``(seq_at_position, gap_chars)`` tuple so the
    method can be used directly with ``Pool.map``. Residues found in
    ``gap_chars`` are ignored; the remaining residues are upper-cased, and
    the column is variable when more than one distinct residue is left.
    """
    seq_at_position, gap_chars = args
    # Gap membership is tested on the residue as written, before
    # upper-casing.
    observed_residues = {
        residue.upper()
        for residue in seq_at_position
        if residue not in gap_chars
    }

    return 1 if len(observed_residues) > 1 else 0
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
class TestSumOfPairsScore(object):
@patch("builtins.print")
def test_sum_of_pairs_score_full_ref(self, mocked_print):
expected_result = 0.7714
expected_result = 0.4
testargs = [
"phykit",
"sum_of_pairs_score",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import pytest
import sys
from mock import patch, call
from pathlib import Path
from textwrap import dedent
import pytest
import sys

from phykit.phykit import Phykit

Expand Down Expand Up @@ -70,4 +69,4 @@ def test_variable_sites_incorrect_input_file(self, mocked_print):
Phykit()

assert pytest_wrapped_e.type == SystemExit
assert pytest_wrapped_e.value.code == 2
assert pytest_wrapped_e.value.code == 2
3 changes: 2 additions & 1 deletion tests/unit/services/alignment/test_variable_sites.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@

@pytest.fixture
def args():
    """Parsed-argument namespace with an alignment path and a CPU count of 1."""
    kwargs = dict(alignment="/some/path/to/file.fa", cpu=1)
    return Namespace(**kwargs)


class TestVariableSites(object):
def test_init_sets_alignment_file_path(self, args):
# Construct the service from the fixture namespace and check the attributes
# it exposes afterwards.
vs = VariableSites(args)
assert vs.alignment_file_path == args.alignment
# NOTE(review): the fixture passes cpu=1 but a 1-tuple is expected back.
# Presumably the base initializer stores `cpu` with a trailing comma --
# confirm whether the tuple is intended before relying on `self.cpu`.
assert vs.cpu == (1,)
assert vs.output_file_path is None

def test_variable_sites(self, alignment_simple, args):
Expand Down

0 comments on commit f13447b

Please sign in to comment.