Skip to content
This repository has been archived by the owner on Nov 28, 2023. It is now read-only.

debug deeprank for models with no feature value #242

Merged
merged 26 commits into from
Nov 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 66 additions & 59 deletions deeprank/learn/DataSet.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ def __init__(self, train_database, valid_database=None, test_database=None,
target_ordering=None,
dict_filter=None, pair_chain_feature=None,
transform_to_2D=False, projection=0,
grid_shape=None,
clip_features=True, clip_factor=1.5,
rotation_seed=None,
tqdm=False,
Expand All @@ -60,10 +59,13 @@ def __init__(self, train_database, valid_database=None, test_database=None,
mapfly (bool): do we compute the map in the batch
preparation or read them

grid_info(dict): grid information to map the feature on the
fly. if None the original grid points are used.
grid_info(dict): grid information to map the feature.
If None the original grid points are used.
The dict contains:
- 'number_of_points', the shape of the grid
- 'resolution', the resolution of the grid, unit in A
Example:
{'number_of_points': [X,Y,Z], 'resolution': [X,Y,Z]}
{'number_of_points': [10, 10, 10], 'resolution': [3, 3, 3]}

use_rotation (int): number of rotations to use.
Example: 0 (use only original data)
Expand Down Expand Up @@ -105,9 +107,6 @@ def __init__(self, train_database, valid_database=None, test_database=None,
projection (int): Projection axis from 3D to 2D:
Mapping: 0 -> yz, 1 -> xz, 2 -> xy
Default = 0
grid_shape (None or tuple(int), optional):
Shape of the grid in the hdf5 file. Is not necessary
if the grid points are still present in the HDF5 file.
clip_features (bool, optional):
Remove too large values of the grid.
Can be needed for native complexes where the coulomb
Expand All @@ -128,7 +127,10 @@ def __init__(self, train_database, valid_database=None, test_database=None,
>>> test_database = None,
>>> chain1='C',
>>> chain2='D',
>>> grid_shape=(30,30,30),
>>> grid_info = {
>>> 'number_of_points': (10, 10, 10),
>>> 'resolution': (3, 3, 3)
>>> },
>>> select_feature = {
>>> 'AtomicDensities': 'all',
>>> 'Features': [
Expand Down Expand Up @@ -165,6 +167,7 @@ def __init__(self, train_database, valid_database=None, test_database=None,
# map generation
self.mapfly = mapfly
self.grid_info = grid_info
self._grid_shape = None

# data augmentation
if self.mapfly:
Expand All @@ -185,7 +188,6 @@ def __init__(self, train_database, valid_database=None, test_database=None,
# shape of the data
self.input_shape = None
self.data_shape = None
self.grid_shape = grid_shape

# the possible pairing of the ind features
self.pair_chain_feature = pair_chain_feature
Expand Down Expand Up @@ -291,7 +293,7 @@ def process_dataset(self):
self.get_input_shape()

# get renormalization factor
if self.normalize_features or self.normalize_targets:
if self.normalize_features or self.normalize_targets or self.clip_features:
if self.mapfly:
self.compute_norm()
else:
Expand Down Expand Up @@ -811,7 +813,6 @@ def get_grid_shape(self):
the HDF5 file
"""
if self.mapfly is False:

fname = self.train_database[0]
fh5 = h5py.File(fname, 'r')
mol = list(fh5.keys())[0]
Expand All @@ -820,29 +821,22 @@ def get_grid_shape(self):
mol_data = fh5.get(mol)

# get the grid size
if self.grid_shape is None:

if 'grid_points' in mol_data:
nx = mol_data['grid_points']['x'].shape[0]
ny = mol_data['grid_points']['y'].shape[0]
nz = mol_data['grid_points']['z'].shape[0]
self.grid_shape = (nx, ny, nz)

else:
raise ValueError(
f'Impossible to determine sparse grid shape.\n '
f'Specify argument grid_shape=(x,y,z)')
if 'grid_points' in mol_data:
nx = mol_data['grid_points']['x'].shape[0]
ny = mol_data['grid_points']['y'].shape[0]
nz = mol_data['grid_points']['z'].shape[0]
self._grid_shape = (nx, ny, nz)

fh5.close()

elif self.grid_info is not None:
self.grid_shape = self.grid_info['number_of_points']
fh5.close()

else:
raise ValueError(
f'Impossible to determine sparse grid shape.\n'
f'If you are not loading a pretrained model, '
f' specify grid_shape or grid_info')
if self._grid_shape is None:
if self.grid_info is not None:
self._grid_shape = self.grid_info['number_of_points']
else:
raise ValueError(
f'Impossible to determine sparse grid shape.\n'
f'If you are not loading a pretrained model, '
f' specify argument "grid_info".')

def compute_norm(self):
"""compute the normalization factors."""
Expand Down Expand Up @@ -897,7 +891,7 @@ def compute_norm(self):
self.target_min = self.param_norm['targets'].min[0]
self.target_max = self.param_norm['targets'].max[0]

logger.info(self.target_min, self.target_max)
logger.info(f'{self.target_min}, {self.target_max}')

def get_norm(self):
"""Get the normalization values for the features."""
Expand Down Expand Up @@ -940,7 +934,7 @@ def _read_norm(self):
# if the file doesn't exist we create it
if not os.path.isfile(fdata):
logger.info(f" Computing norm for {f5}")
norm = NormalizeData(f5, shape=self.grid_shape)
norm = NormalizeData(f5, shape=self._grid_shape)
norm.get()

# read the data
Expand Down Expand Up @@ -1045,6 +1039,7 @@ def _normalize_feature(self, feature):
) / self.feature_std[ic]
return feature


def _clip_feature(self, feature):
"""Clip the value of the features at +/- mean + clip_factor * std.
Args:
Expand All @@ -1055,10 +1050,12 @@ def _clip_feature(self, feature):

w = self.clip_factor
for ic in range(self.data_shape[0]):
minv = self.feature_mean[ic] - w * self.feature_std[ic]
maxv = self.feature_mean[ic] + w * self.feature_std[ic]
feature[ic] = np.clip(feature[ic], minv, maxv)
#feature[ic] = self._mad_based_outliers(feature[ic],minv,maxv)
if len(feature[ic]) > 0:
minv = self.feature_mean[ic] - w * self.feature_std[ic]
maxv = self.feature_mean[ic] + w * self.feature_std[ic]
if minv != maxv:
feature[ic] = np.clip(feature[ic], minv, maxv)
#feature[ic] = self._mad_based_outliers(feature[ic],minv,maxv)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is that line commented ? should we simply remove it ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No idea, this line was already commented in the code

return feature

@staticmethod
Expand Down Expand Up @@ -1160,15 +1157,18 @@ def load_one_molecule(self, fname, mol=None):
mat = sparse.FLANgrid(sparse=True,
index=data['index'][:],
value=data['value'][:],
shape=self.grid_shape).to_dense()
shape=self._grid_shape).to_dense()
else:
mat = data['value'][:]

# append to the list of features
feature.append(mat)

# get the target value
target = mol_data.get('targets/' + self.select_target)[()]
try:
target = mol_data.get('targets/' + self.select_target)[()]
except Exception:
logger.exception(f'No target value for: {fname} - not required for the test set')

# close
fh5.close()
Expand Down Expand Up @@ -1214,7 +1214,10 @@ def map_one_molecule(self, fname, mol=None, angle=None, axis=None):
feature += data

# get the target value
target = mol_data.get('targets/' + self.select_target)[()]
try:
target = mol_data.get('targets/' + self.select_target)[()]
except Exception:
logger.exception(f'No target value for: {fname} - not required for the test set')

# close
fh5.close()
Expand Down Expand Up @@ -1432,30 +1435,34 @@ def map_feature(self, feat_names, mol_data, grid, npts, angle, axis):
tmp_feat_vect = [np.zeros(npts), np.zeros(npts)]
data = np.array(mol_data['features/' + name][()])

chain = data[:, 0]
pos = data[:, 1:4]
feat_value = data[:, 4]
if data.shape[0]==0:
logger.warning(f'No {name} retrieved at the protein/protein interface')

if angle is not None:
pos = pdb2sql.transform.rot_xyz_around_axis(
pos, axis, angle, center)
else:
chain = data[:, 0]
pos = data[:, 1:4]
feat_value = data[:, 4]

if angle is not None:
pos = pdb2sql.transform.rot_xyz_around_axis(
pos, axis, angle, center)

if __vectorize__ or __vectorize__ == 'both':
if __vectorize__ or __vectorize__ == 'both':

for chainID in [0, 1]:
tmp_feat_vect[chainID] = np.sum(
vmap(pos[chain == chainID, :],
feat_value[chain == chainID]),
0)
for chainID in [0, 1]:
tmp_feat_vect[chainID] = np.sum(
vmap(pos[chain == chainID, :],
feat_value[chain == chainID]),
0)

if not __vectorize__ or __vectorize__ == 'both':
if not __vectorize__ or __vectorize__ == 'both':

for chainID, xyz, val in zip(chain, pos, feat_value):
tmp_feat_ser[int(chainID)] += \
self._featgrid(xyz, val, grid, npts)
for chainID, xyz, val in zip(chain, pos, feat_value):
tmp_feat_ser[int(chainID)] += \
self._featgrid(xyz, val, grid, npts)

if __vectorize__ == 'both':
assert np.allclose(tmp_feat_ser, tmp_feat_vect)
if __vectorize__ == 'both':
assert np.allclose(tmp_feat_ser, tmp_feat_vect)

if __vectorize__:
feat += tmp_feat_vect
Expand Down
Loading