-
Notifications
You must be signed in to change notification settings - Fork 0
/
muscle2phylip_path.py
95 lines (79 loc) · 3.46 KB
/
muscle2phylip_path.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python3
# -*- encoding:utf-8 -*-
# @Author : Haoran Pan
# date: 2021/4/19
import sys
from path import Path
from Bio.Seq import Seq
from Bio import SeqIO
import argparse
import SetLog as log
__author__ = 'Haoran Pan'
__mail__ = 'haoran_pan@qq.com'
__date__ = '20210425'
__version__ = 'v1.2'
def get_msa(filepath):
msa_files = [file for file in Path(filepath).files() if file.endswith('.msa')]
return msa_files
def parse_msa(msa_files):
gene2seq = {}
for msa in msa_files:
for seq in SeqIO.parse(msa,'fasta'):
gene2seq[seq.id] = seq.seq
return gene2seq
def msa2phylip(gene2seq,species_list,file_type):
species_length = []
species2seq = {}
for species in species_list:
if not '/' in species:
concat_seq = Seq('')
for key in gene2seq.keys():
if key.startswith(species):
concat_seq += gene2seq[key]
species_length.append(len(concat_seq))
species2seq[species] = concat_seq
else:
name1,name2 = species.split('/')
concat_seq = Seq('')
for key in gene2seq.keys():
if key.startswith(name1):
concat_seq += gene2seq[key]
elif key.startswith(name2):
concat_seq += gene2seq[key]
species_length.append(len(concat_seq))
species2seq[name1] = concat_seq
length_uniq = list(set(species_length))
if len(length_uniq) == 1:
with open('SingleCopy.{}.phylip'.format(file_type),'w') as f:
f.write('{}\t{}\n'.format(len(species_list),length_uniq[0]))
for species in species_list:
if '/' in species:
name1, name2 = species.split('/')
# print(name1,species2seq[name1])
f.write('{} {}\n'.format(name1,species2seq[name1]))
else:
# print(species,species2seq[species])
f.write('{} {}\n'.format(species,species2seq[species]))
else:
species_length = [str(i) for i in species_length]
print('\t'.join(species_length))
log.error_out('Species super-gene length is inconsistent')
def main(args):
msapath = args.path
specieslist = args.species
file_type = args.type
msa_files = get_msa(msapath)
gene2seq = parse_msa(msa_files)
msa2phylip(gene2seq,specieslist,file_type)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
prog=sys.argv[0],
formatter_class=argparse.RawTextHelpFormatter,
description='''Combine the results of multiple sequence alignments end to end to form a super-gene to generate a phylip file''',
usage="python {} -p msa_path -s ID1 ID2 ID3..\n\nNote:ID is the ID of the gene in the genome of each species!!!\nGenerated file: SingleCopy.pep.phylip".format(sys.argv[0]),
epilog= 'author:\t{0}\nmail:\t{1}\ndate:\t{2}\nversion:\t{3}'.format(__author__,__mail__,__date__,__version__))
parser.add_argument('-p','--path',required=True,help='Input the path where the msa file is located')
parser.add_argument('-t','--type',required=True,choices=['pep','cds'],help='Specify the type of input sequence')
parser.add_argument('-s','--species',required=True,nargs='+',help='Input the gene ID prefix for each species genome')
args = parser.parse_args()
main(args)