-
Notifications
You must be signed in to change notification settings - Fork 9
/
featurizers.py
95 lines (81 loc) · 3.11 KB
/
featurizers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import functools
from rdkit.Chem.Draw import SimilarityMaps
from rdkit.Chem import AllChem
import rdkit.Chem as Chem
import numpy as np
from tqdm import tqdm
from chemprop.data.data import MoleculeDatapoint
# Default fingerprint length; used as the default `fp_size` of
# FingerprintFeaturizer, where it is passed to RDKit as `nBits`.
FP_SIZE = 2048
# Default `radius` of FingerprintFeaturizer, passed to RDKit's Morgan
# fingerprint functions.
FP_RADIUS = 3
class Featurizer:
    """Common base class for the featurizers in this module.

    It defines no attributes or methods of its own; subclasses supply
    their own ``prepare_x``.
    """
class GraphFeaturizer(Featurizer):
    """Pair SMILES strings with targets as chemprop ``MoleculeDatapoint`` objects."""

    def __init__(self, smis, targets):
        """Store the inputs unchanged.

        Args:
            smis: Iterable of SMILES entries, one per molecule.
            targets: Iterable of per-molecule target sequences, aligned
                with ``smis``.
        """
        self.targets = targets
        self.smiles = smis

    def prepare_x(self):
        """Return a list with one ``MoleculeDatapoint`` per (smiles, targets) pair.

        Every target value is converted with ``float``. Pairing uses
        ``zip``, so the result is as long as the shorter of the two inputs.
        """
        datapoints = []
        for smiles_entry, target_row in zip(self.smiles, self.targets):
            numeric_targets = list(map(float, target_row))
            datapoints.append(
                MoleculeDatapoint(smiles=smiles_entry, targets=numeric_targets)
            )
        return datapoints
class FingerprintFeaturizer(Featurizer):
    """Convert SMILES strings into fixed-length boolean Morgan fingerprints.

    Fingerprints are computed with RDKit's
    ``AllChem.GetMorganFingerprintAsBitVect`` using ``useChirality=True``.
    """

    def __init__(self, fp_size=FP_SIZE, radius=FP_RADIUS):
        """Store the fingerprint settings.

        Args:
            fp_size: Number of bits per fingerprint (passed as ``nBits``).
            radius: Morgan radius passed to RDKit.
        """
        self.fp_size = fp_size
        self.radius = radius
        # SimilarityMaps.GetMorganFingerprint pre-bound to the same
        # radius / size / chirality settings as calc_fp. It is not called
        # anywhere inside this class.
        self.simmap_featurizer = functools.partial(
            SimilarityMaps.GetMorganFingerprint,
            radius=self.radius,
            nBits=self.fp_size,
            useChirality=True,
        )

    def calc_fp(self, smi, bitInfo=False):
        """Compute the fingerprint of a single SMILES string.

        Args:
            smi: SMILES string, parsed with ``Chem.MolFromSmiles``.
            bitInfo: When truthy, also collect RDKit's ``bitInfo`` mapping.

        Returns:
            A boolean array when ``bitInfo`` is falsy, otherwise a tuple
            ``(array, info)`` where ``info`` is the dict handed to RDKit as
            ``bitInfo``.
        """
        # NOTE(review): the parse result is passed to RDKit unchecked.
        # RDKit documents that MolFromSmiles returns None for unparsable
        # input - confirm callers pre-validate their SMILES.
        mol = Chem.MolFromSmiles(smi)
        fp_kwargs = {
            'radius': self.radius,
            'nBits': self.fp_size,
            'useChirality': True,
        }
        info = {}
        if bitInfo:
            # Only hand the dict to RDKit when the caller asked for it, so
            # the plain path issues exactly the same call as before.
            fp_kwargs['bitInfo'] = info
        fp = np.array(
            AllChem.GetMorganFingerprintAsBitVect(mol, **fp_kwargs),
            dtype='bool',
        )
        if bitInfo:
            return fp, info
        return fp

    def prepare_x(self, df_data, bitInfo=False):
        """Build the fingerprint matrix for every entry of ``df_data['smiles']``.

        Args:
            df_data: Object supporting ``len()`` and ``['smiles']`` lookup;
                presumably a pandas DataFrame - confirm against callers.
            bitInfo: When truthy, also return the per-molecule ``bitInfo``
                dicts.

        Returns:
            A boolean array of shape ``(len(df_data), fp_size)``, or a tuple
            ``(array, info_all)`` when ``bitInfo`` is truthy, where
            ``info_all`` lists one dict per molecule in input order.
        """
        # np.bool_ instead of the np.bool alias: the alias was removed in
        # NumPy 1.24 and raises AttributeError on 1.24-1.26.
        x = np.zeros((len(df_data), self.fp_size), dtype=np.bool_)
        info_all = []
        for i, smi in enumerate(tqdm(df_data['smiles'])):
            if bitInfo:
                fp, info = self.calc_fp(smi, bitInfo=True)
                info_all.append(info)
            else:
                fp = self.calc_fp(smi, bitInfo=False)
            x[i, :] = fp
        if bitInfo:
            return x, info_all
        return x
class OneHotFeaturizer(Featurizer):
    """One-hot encode the three cycle ids of each row, per library.

    Cycle ids are 1-indexed. Every (library, cycle position) pair gets its
    own run of columns whose width is the number of distinct ids seen for
    that pair at construction time. The encoding therefore assumes the ids
    of each pair are exactly ``1..k`` with no gaps; an id outside that
    range lands in another pair's columns or outside the array.
    """

    tags = ['library_id', 'cycle1', 'cycle2', 'cycle3']

    def __init__(self, df_data):
        """Derive column offsets from the data the encoder is fitted on.

        Args:
            df_data: pandas DataFrame containing the columns in ``tags``.
        """
        self.offsets = {}
        # Start at -1 so that ``offset + cycle_id`` is a 0-based column
        # index for 1-indexed cycle ids.
        current_offset = -1
        for lib_id, df_data_lib in df_data[self.tags].groupby('library_id'):
            for cycnum in (1, 2, 3):
                self.offsets[(lib_id, cycnum)] = current_offset
                current_offset += len(df_data_lib[f'cycle{cycnum}'].unique())
        # Total number of one-hot columns across all libraries and cycles.
        self.length = current_offset + 1

    def prepare_x(self, df_data):
        """Return the one-hot matrix for the rows of ``df_data``.

        Args:
            df_data: pandas DataFrame containing the columns in ``tags``.

        Returns:
            A boolean array of shape ``(len(df_data), self.length)`` with
            three bits set per row, one for each cycle position.

        Raises:
            KeyError: If a row's ``library_id`` was not seen in ``__init__``.
        """
        # np.bool_ instead of the np.bool alias: the alias was removed in
        # NumPy 1.24 and raises AttributeError on 1.24-1.26.
        x = np.zeros((len(df_data), self.length), dtype=np.bool_)
        # itertuples keeps each column's own dtype; iterrows would upcast a
        # row mixing float and int columns to float, and float cycle ids
        # cannot be used as array indices.
        rows = df_data[self.tags].itertuples(index=False, name=None)
        for i, (lib_id, cyc1, cyc2, cyc3) in enumerate(
                tqdm(rows, total=len(df_data))):
            x[i, self.offsets[lib_id, 1] + cyc1] = True
            x[i, self.offsets[lib_id, 2] + cyc2] = True
            x[i, self.offsets[lib_id, 3] + cyc3] = True
        return x