-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path20180622_pocpPipeline.py
102 lines (95 loc) · 3.12 KB
/
20180622_pocpPipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# coding: utf-8
# copyright: qyliang
# time: 06-15-2018
import os
import pandas as pd
from collections import OrderedDict
# 定义获取每个物种的总蛋白数量
def Total(filename):
os.chdir(rawFilePath)
file = open(filename)
text = file.read()
proteinTotal = text.count('>')
file.close()
return proteinTotal
# 定义计算每个物种注释的蛋白含有的氨基酸个数
def aaTotal(filename):
os.chdir(rawFilePath)
file = open(filename)
text = file.read()
file.close()
ltsp = text.split('>')
aaDict = OrderedDict()
for line in ltsp:
if line != '':
firstBreakIndex = line.index('\n')
key = line[:firstBreakIndex].split(' ')[0]
aaDict[key] = len(line[firstBreakIndex:])
else:
pass
return aaDict
# 定义构建数据库函数
def creatdb():
os.chdir(scriptPath)
proc1 = r'.\makeblastdb.exe -dbtype prot -parse_seqids -in .\Rawdata\\'
proc2 = r' -out .\Database\\'
for fn in fnlist:
process = proc1 + fn + proc2 + fn.split('.')[0]
print('='*10,process)
os.system(process)
# 定义流程化blastp函数
def blast():
proc1 = r'.\blastp.exe -evalue 1e-5 -max_target_seqs 1 -outfmt 6'
proc2 = r' -query .\Rawdata\\'
proc3 = r' -db .\Database\\'
proc4 = r' -out .\Result\\'
for fnA in fnlist:
for fnB in fnlist:
os.chdir(scriptPath)
if (fnA != fnB) and (fnlist.index(fnA) < fnlist.index(fnB)):
A = fnA
B = fnB
# A2B
processAB = proc1 + proc2 + A + proc3 + B.split('.')[0] + proc4 + A.split('.')[0] + 'TO' + B.split('.')[0] + '.tab'
# B2A
processBA = proc1 + proc2 + B + proc3 + A.split('.')[0] + proc4 + B.split('.')[0] + 'TO' + A.split('.')[0] + '.tab'
print('='*10,processAB)
os.system(processAB)
print('='*10,processBA)
os.system(processBA)
os.chdir(resultPath)
resultFileName1 = A.split('.')[0] + 'TO' + B.split('.')[0] + '.tab'
resultFileName2 = B.split('.')[0] + 'TO' + A.split('.')[0] + '.tab'
dframe1 = pd.read_csv(resultFileName1, sep = '\t', header = None)
dframe2 = pd.read_csv(resultFileName2, sep = '\t', header = None)
D1 = aaTotal(A)
for i in range(dframe1.shape[0]):
dframe1.iloc[i,11] = (dframe1.iloc[i,3])/(D1[dframe1.iloc[i,0]])
cdframe1 = dframe1[dframe1.iloc[:,2] > 40]
cdframe11 = cdframe1[cdframe1.iloc[:,11] > 0.5]
C1 = cdframe11.shape[0]
D2 = aaTotal(B)
for i in range(dframe2.shape[0]):
dframe2.iloc[i,11] = (dframe2.iloc[i,3])/(D2[dframe2.iloc[i,0]])
cdframe2 = dframe2[dframe2.iloc[:,2] > 40]
cdframe22 = cdframe2[cdframe2.iloc[:,11] > 0.5]
C2 = cdframe22.shape[0]
T1 = Total(A)
T2 = Total(B)
POCP = (C1 + C2)/(T1 + T2)
pocpResult.append(A.split('.')[0] + '\t' + B.split('.')[0] + '\t' + str(POCP) + '\n')
print(pocpResult)
# 路径部分需要根据自己的安装的情况调整
rawFilePath = r'E:\POCP\Rawdata'
scriptPath = r'E:\POCP'
resultPath = r'E:\POCP\Result'
fnlist = os.listdir(rawFilePath)
pocpResult = []
creatdb()
blast()
os.chdir(scriptPath)
with open('POCP.txt','w') as file:
for line in pocpResult:
file.write(line)
file.close()
print('='*50)