-
Notifications
You must be signed in to change notification settings - Fork 1
/
processNeedle.py
89 lines (73 loc) · 2.51 KB
/
processNeedle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import pandas as pd
import argparse
from sklearn.cluster import KMeans
# Column at which the aligned text starts on consensus/sample/markup rows.
# NOTE(review): presumably the fixed label+position gutter of EMBOSS needle
# pair output (the script is named processNeedle.py) -- confirm on real input.
_SEQ_START = 21

# Row-label prefixes whose rows are collected as the sample ("self") sequence.
_SAMPLE_PREFIXES = ('CHM', 'HG', 'NA', 'RY')


def _parse_alignments(lines):
    """Gather the consensus, sample and markup text of every alignment.

    A line starting with ``'# 2: '`` opens a new record keyed by the text
    after that prefix.  Following rows are appended to the record by label:
    ``consensus`` rows, rows starting with one of ``_SAMPLE_PREFIXES``
    (stored under ``'self'``), and rows starting with a space (stored under
    ``'marker'``).  All other lines are ignored.

    Args:
        lines: Iterable of text lines (for example an open file).

    Returns:
        Dict mapping record name to a dict with the concatenated
        ``'consensus'``, ``'self'`` and ``'marker'`` strings.

    Raises:
        ValueError: If an alignment row appears before any ``'# 2: '`` line.
    """
    records = {}
    current = None
    seqlen = -1  # width of the most recent consensus chunk
    for lineno, raw in enumerate(lines, 1):
        # Drop the line terminator up front so it can never leak into a
        # record name or into a sliced sequence/markup chunk.
        line = raw.rstrip('\r\n')
        if line.startswith('# 2: '):
            name = line.split('# 2: ')[-1]
            current = records[name] = {
                'consensus': [], 'self': [], 'marker': [],
            }
            continue

        if line.startswith('consensus'):
            field = 'consensus'
        elif line.startswith(_SAMPLE_PREFIXES):
            field = 'self'
        elif line.startswith(' '):
            field = 'marker'
        else:
            continue

        if current is None:
            raise ValueError(
                "line {}: alignment row found before any '# 2: ' "
                "header".format(lineno)
            )

        if field == 'consensus':
            # The chunk ends at the first space (a trailing position number
            # may follow); its width bounds the other rows of this chunk.
            chunk = line[_SEQ_START:].split(' ')[0]
            seqlen = len(chunk)
        else:
            chunk = line[_SEQ_START:_SEQ_START + seqlen]
            if field == 'marker' and chunk:
                # Markup rows end in spaces wherever the tail does not
                # match; if those were stripped, pad back to the consensus
                # width so later chunks stay column-aligned.
                chunk = chunk.ljust(seqlen)
        current[field].append(chunk)

    return {
        name: {field: ''.join(parts) for field, parts in record.items()}
        for name, record in records.items()
    }


def _mismatch_row(name, consensus, marker):
    """Encode one alignment as 0/1 flags over non-gap consensus columns.

    Columns where the consensus holds ``'-'`` are skipped.  Every other
    column yields 0 when the markup character is ``'|'`` and 1 otherwise.

    Raises:
        ValueError: If the markup ends before a non-gap consensus column.
    """
    row = []
    for pos, base in enumerate(consensus):
        if base == '-':
            continue
        if pos >= len(marker):
            raise ValueError(
                "alignment {!r}: markup has {} columns but the consensus "
                "needs column {}".format(name, len(marker), pos + 1)
            )
        row.append(0 if marker[pos] == '|' else 1)
    return row


def main(argv=None):
    """Cluster aligned samples by their per-column mismatch pattern.

    Reads the alignment file given with ``-i``, builds one 0/1 mismatch row
    per alignment, fits ``KMeans`` with ``-n`` clusters (``random_state=10``)
    and prints the predicted labels.  Two tab-separated files are written,
    their names formed by appending to the ``-o`` value as a plain string
    prefix: ``datamatrix.<n>.xls`` (the matrix) and
    ``datamatrix.<n>.label.xls`` (the labels).

    Args:
        argv: Optional list of command-line arguments; ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: From argparse on missing or invalid arguments.
        OSError: If the input file cannot be opened.
        ValueError: If the input holds no alignments, is malformed, or the
            alignments do not all yield the same number of columns.
    """
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("-i", "--pairwise_aligned_fasta", required=True)
    # type=int lets argparse reject a non-numeric value with a usage error.
    parser.add_argument("-n", "--cluster_number", required=True, type=int)
    parser.add_argument("-o", "--output_path", required=True)
    args = parser.parse_args(argv)
    n = args.cluster_number
    output = args.output_path

    # The context manager closes the file even when parsing raises.
    with open(args.pairwise_aligned_fasta, 'r') as handle:
        data_table = _parse_alignments(handle)
    if not data_table:
        raise ValueError(
            "no alignments found in {!r}".format(args.pairwise_aligned_fasta)
        )

    datamatrix = {
        name: _mismatch_row(name, record['consensus'], record['marker'])
        for name, record in data_table.items()
    }
    widths = {len(row) for row in datamatrix.values()}
    if len(widths) > 1:
        # A rectangular matrix is required; fail with a clear message
        # before handing ragged rows to pandas.
        raise ValueError(
            "alignments do not share one feature count: {}".format(
                sorted(widths)
            )
        )

    # Keys become columns, so transpose to get one row per alignment.
    datamatrix = pd.DataFrame(datamatrix).T

    kmeans = KMeans(n_clusters=n, random_state=10)
    kmeans.fit(datamatrix)
    predict = kmeans.predict(datamatrix)
    print(predict)
    predict = pd.DataFrame(predict, index=datamatrix.index)
    print(predict)

    datamatrix.to_csv(output + 'datamatrix.' + str(n) + '.xls', sep='\t')
    predict.to_csv(output + 'datamatrix.' + str(n) + '.label.xls', sep='\t')
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()