-
Notifications
You must be signed in to change notification settings - Fork 4
/
convert_chemprop_predict.py
122 lines (98 loc) · 3.94 KB
/
convert_chemprop_predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import argparse
import subprocess
import warnings
import os
# Suppress all Python warnings for the duration of the script.
warnings.filterwarnings(action='ignore')

# Command-line interface: two optional string arguments, both with defaults.
parser = argparse.ArgumentParser(description='chemprop train convert arguments')
parser.add_argument('--testisf', metavar='TESTISF', type=str, default="input/test.csv",
                    help='the name of the test id and smiles file (default: input/test.csv)')
parser.add_argument('--sc', metavar='SC', type=str, default="kinase",
                    help='the name of the source checkpoint (default: kinase)')

# Project paths are derived from the current working directory: whatever
# precedes the first "TransferLearning4DTI" in cwd is used as the parent
# (the whole cwd when that name does not occur in it).
cwd = os.getcwd()
project_file_path = cwd.split("TransferLearning4DTI")[0] + "/TransferLearning4DTI"
training_files_path = project_file_path + "/training_files"
def get_compound_ids(FileName, smiles_lst):
    """Return ids of the rows in ``FileName`` whose SMILES is in ``smiles_lst``.

    ``FileName`` is read as comma-separated text where the first column is
    the compound id and the second column is the SMILES string. Any line
    containing the text "compound" is treated as a header and skipped.
    Lines with fewer than two columns (e.g. blank lines) are ignored.

    Args:
        FileName: path of the id/SMILES CSV file.
        smiles_lst: iterable of SMILES strings to keep.

    Returns:
        List of compound ids, in file order, whose SMILES was requested.
    """
    # Build the lookup once so each row costs an O(1) membership test
    # instead of a scan over the whole list.
    wanted_smiles = set(smiles_lst)
    compound_lst = []
    with open(FileName) as f:
        for line in f:
            if "compound" in line:
                continue
            feature_lst = line.rstrip('\n').split(",")
            # A blank or single-column line has no SMILES column to test.
            if len(feature_lst) < 2:
                continue
            if feature_lst[1] in wanted_smiles:
                compound_lst.append(feature_lst[0])
    return compound_lst
def get_smiles_chemprop(FileName):
    """Collect the first comma-separated column of every row in ``FileName``.

    Lines that contain the text "smiles" are treated as the header and are
    left out; all other lines contribute their first field, in file order.
    """
    with open(FileName) as handle:
        return [
            row.rstrip('\n').split(",")[0]
            for row in handle
            if "smiles" not in row
        ]
def read_smiles(FileName):
    """Read an id/SMILES CSV file into two lookup dictionaries.

    The first line of the file is always skipped as a header. Every other
    line is split on commas; column 0 is taken as the compound id and
    column 1 as the SMILES string. Lines with fewer than two columns
    (e.g. a trailing blank line) are ignored instead of raising.

    Args:
        FileName: path of the CSV file to read.

    Returns:
        A tuple ``(smilesCompoundDict, compoundSmileDict)`` where the first
        maps SMILES -> compound id and the second maps compound id -> SMILES.
        When a key repeats, the last row in the file wins.
    """
    smilesCompoundDict = {}
    compoundSmileDict = {}
    with open(FileName) as f:
        # Drop the header row; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            id_smiles_class = line.rstrip('\n').split(",")
            if len(id_smiles_class) < 2:
                continue
            compound_id, smiles = id_smiles_class[0], id_smiles_class[1]
            smilesCompoundDict[smiles] = compound_id
            compoundSmileDict[compound_id] = smiles
    return smilesCompoundDict, compoundSmileDict
def read_csv_convert_to_tsv(csv_file_name, tsv_file_name, compoundSmileDict, compound_lst):
    """Rewrite a fingerprint CSV as a tab-separated file keyed by compound id.

    Each data row of ``csv_file_name`` has its first column looked up in
    ``compoundSmileDict`` to obtain the compound id; rows whose id is in
    ``compound_lst`` are written to ``tsv_file_name`` as the id followed by
    the remaining columns converted to ``float``. Lines containing the text
    "smiles" are treated as the header and skipped, as are blank lines.

    Args:
        csv_file_name: path of the comma-separated input file.
        tsv_file_name: path of the tab-separated output file (overwritten).
        compoundSmileDict: mapping from the first CSV column to a compound id.
        compound_lst: iterable of compound ids that should be written.

    Raises:
        KeyError: if a row's first column is not a key of ``compoundSmileDict``.
        ValueError: if a value column cannot be converted to ``float``.
    """
    # Build the lookup once so each row costs an O(1) membership test.
    wanted_ids = set(compound_lst)
    with open(tsv_file_name, 'w') as wf, open(csv_file_name) as f:
        for line in f:
            if "smiles" in line:
                continue
            # A blank line has no key to look up; skip it rather than
            # failing with KeyError('').
            if not line.strip():
                continue
            csv_line = line.rstrip('\n').split(",")
            comp_id = compoundSmileDict[csv_line[0]]
            if comp_id in wanted_ids:
                values = "\t".join(str(float(dim)) for dim in csv_line[1:])
                wf.write(comp_id + "\t" + values + "\n")
def write_smiles(file_path, compoundSmilesDict):
    """Write the values of ``compoundSmilesDict`` as a one-column CSV file.

    The file starts with the header line ``smiles`` followed by one value
    per line, in the dictionary's iteration order. An existing file at
    ``file_path`` is overwritten.

    Args:
        file_path: destination path; written as UTF-8.
        compoundSmilesDict: mapping whose values are the strings to write.
    """
    # The context manager closes the file even if a write raises.
    with open(file_path, "w", encoding='utf-8') as wf:
        wf.write("smiles\n")
        wf.writelines(value + "\n" for value in compoundSmilesDict.values())
if __name__ == '__main__':
    # Script entry point:
    #   1. read the id/SMILES test file and write a SMILES-only CSV,
    #   2. run the external ``chemprop_fingerprint`` command on that CSV,
    #   3. convert the CSV it produced into a TSV keyed by compound id.
    args = parser.parse_args()
    source_checkpoint = args.sc
    test_file = args.testisf

    smilesCompoundDict, compoundSmilesDict = read_smiles(test_file)

    # splitext removes only the final extension, so a path containing other
    # dots (e.g. "./input/test.csv") keeps its directory and stem intact.
    test_stem = os.path.splitext(test_file)[0]
    smiles_file = test_stem + "_smiles.csv"
    write_smiles(smiles_file, compoundSmilesDict)

    test_chemprop_file = project_file_path + "/output/" + os.path.basename(test_stem) + "_chemprop.csv"
    print("chemprop_fingerprint is running")
    # Pass the arguments as a list so paths containing whitespace stay
    # single arguments; the checkpoint path is relative to the current
    # working directory.
    cmdCommand = [
        "chemprop_fingerprint",
        "--test_path", smiles_file,
        "--checkpoint_path", "chemprop/" + source_checkpoint + "_checkpoints/model.pt",
        "--preds_path", test_chemprop_file,
    ]
    # check=True stops here with CalledProcessError when the command fails,
    # instead of going on to read a missing or stale predictions file.
    subprocess.run(cmdCommand, check=True)

    smiles_lst = get_smiles_chemprop(test_chemprop_file)
    compound_lst = get_compound_ids(test_file, smiles_lst)
    tsv_file_name = os.path.splitext(test_chemprop_file)[0] + ".tsv"
    read_csv_convert_to_tsv(test_chemprop_file, tsv_file_name, smilesCompoundDict, compound_lst)
    print("Test chemprop file is converted")