
Commit 4b207aa
Improved code clarity and readability
RubenVG02 committed Jul 26, 2024
1 parent 75cb0bc commit 4b207aa
Showing 7 changed files with 494 additions and 527 deletions.
179 changes: 81 additions & 98 deletions src/cnn_affinity.py
@@ -25,157 +25,140 @@
 int_smiles = dict(zip(elements_smiles, range(1, len(elements_smiles)+1)))
 int_fasta = dict(zip(elements_fasta, range(1, len(elements_fasta)+1)))
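These lookup tables start at 1 so that 0 stays free as the padding value used later in convert. A minimal sketch with a hypothetical three-symbol vocabulary:

    elements_smiles = ["C", "O", "="]  # hypothetical vocabulary, for illustration only
    int_smiles = dict(zip(elements_smiles, range(1, len(elements_smiles) + 1)))
    print(int_smiles)  # {'C': 1, 'O': 2, '=': 3}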

-def convert(arx=file_path):
-
-    # Function to convert all elements (both SMILES and FASTA) into int, in order to be trained in the model
-
-    smiles_w_numbers = []  # SMILES encoded as ints via int_smiles, taken from the df
-    for i in arx.smiles:
-        smiles_list = []
-        for elements in i:  # elements refers to the characters that make up elements_smiles
-            try:
-                smiles_list.append(int_smiles[elements])
-            except:
-                pass
-        while (len(smiles_list) != max_smiles):
-            smiles_list.append(0)
+def convert(file_path=file_path):
+
+    '''
+    Function to convert all elements (both SMILES and FASTA) into int, so they can be used to train the model.
+
+    Parameters:
+        file_path (DataFrame): DataFrame containing the SMILES, FASTA and IC50 columns. Columns must be named "smiles", "sequence" and "IC50". This file is generated from src/fix_data_for_models.py
+
+    Returns:
+        smiles_w_numbers (list): List of SMILES converted to integers
+        fasta_w_numbers (list): List of FASTA converted to integers
+        ic50_numeros (list): List of IC50 values
+    '''
+
+    smiles_w_numbers = []
+    for i in file_path.smiles:
+        smiles_list = [int_smiles.get(element, 0) for element in i]
+        smiles_list.extend([0] * (max_smiles - len(smiles_list)))
         smiles_w_numbers.append(smiles_list)

     fasta_w_numbers = []
-    for i in arx.sequence:
-        fasta_list = []
-        for elements in i:  # elements refers to the characters that make up elements_fasta
-            try:
-                fasta_list.append(int_fasta[elements])
-            except:
-                pass
-        while (len(fasta_list) != max_fasta):
-            fasta_list.append(0)
+    for i in file_path.sequence:
+        fasta_list = [int_fasta.get(element, 0) for element in i]
+        fasta_list.extend([0] * (max_fasta - len(fasta_list)))
         fasta_w_numbers.append(fasta_list)
 
-    ic50_numeros = list(arx.IC50)
+    ic50_numeros = list(file_path.IC50)
 
     return smiles_w_numbers, fasta_w_numbers, ic50_numeros


-X_test_smile, X_test_fasta, T_test_IC50 = convert(arx[350000:])
+X_test_smile, X_test_fasta, T_test_IC50 = convert(file_path[350000:])
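As a minimal sketch of the new encoding path in convert, reusing the hypothetical vocabulary above (real values come from the dataset):

    int_smiles = {"C": 1, "O": 2, "=": 3}  # hypothetical mapping, for illustration only
    max_smiles = 5
    encoded = [int_smiles.get(ch, 0) for ch in "C=O"]  # [1, 3, 2]; unknown characters map to 0
    encoded.extend([0] * (max_smiles - len(encoded)))  # padded to [1, 3, 2, 0, 0]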

-def model_cnn():
-    # model to train
-
-    # kernel regularizer
-    regulatos = l2(0.001)
-
-    # model for SMILES
-    smiles_input = Input(
-        shape=(max_smiles,), dtype='int32', name='smiles_input')
-    embed = Embedding(input_dim=len(
-        elements_smiles)+1, input_length=max_smiles, output_dim=128)(smiles_input)
-    x = Conv1D(
-        filters=32, kernel_size=3, padding="SAME", input_shape=(50700, max_smiles))(embed)

+def model_cnn(file_path=file_path):
+
+    '''
+    Function to train a CNN model on the SMILES and FASTA sequences to predict IC50 values.
+
+    Parameters:
+        file_path (DataFrame): DataFrame containing the SMILES, FASTA and IC50 columns. Columns must be named "smiles", "sequence" and "IC50". This file is generated from src/fix_data_for_models.py
+    '''
+
+    regulator = l2(0.001)
+
+    # Model for SMILES
+    smiles_input = Input(shape=(max_smiles,), dtype='int32', name='smiles_input')
+    embed_smiles = Embedding(input_dim=len(elements_smiles)+1, input_length=max_smiles, output_dim=128)(smiles_input)
+    x = Conv1D(filters=32, kernel_size=3, padding="SAME", kernel_regularizer=regulator)(embed_smiles)
     x = PReLU()(x)
 
     x = Conv1D(filters=64, kernel_size=3, padding="SAME")(x)
     x = BatchNormalization()(x)
     x = PReLU()(x)
-    x = Conv1D(
-        filters=128, kernel_size=3, padding="SAME")(x)
+    x = Conv1D(filters=128, kernel_size=3, padding="SAME")(x)
     x = BatchNormalization()(x)
     x = PReLU()(x)
-    pool = GlobalMaxPooling1D()(
-        x)  # maxpool to get a 1d vector
+    pool_smiles = GlobalMaxPooling1D()(x)

-    # model for FASTAs
+    # Model for FASTA
     fasta_input = Input(shape=(max_fasta,), name='fasta_input')
-    embed2 = Embedding(input_dim=len(
-        elements_fasta)+1, input_length=max_fasta, output_dim=256)(fasta_input)
-    x2 = Conv1D(
-        filters=32, kernel_size=3, padding="SAME", input_shape=(50700, max_fasta))(embed2)
-    x2 = PReLU()(embed2)
-
-    x2 = Conv1D(
-        filters=64, kernel_size=3, padding="SAME")(x2)
+    embed_fasta = Embedding(input_dim=len(elements_fasta)+1, input_length=max_fasta, output_dim=256)(fasta_input)
+    x2 = Conv1D(filters=32, kernel_size=3, padding="SAME")(embed_fasta)
+    x2 = PReLU()(x2)
+
+    x2 = Conv1D(filters=64, kernel_size=3, padding="SAME")(x2)
     x2 = BatchNormalization()(x2)
     x2 = PReLU()(x2)
-    x2 = Conv1D(
-        filters=128, kernel_size=3, padding="SAME")(x2)
+    x2 = Conv1D(filters=128, kernel_size=3, padding="SAME")(x2)
     x2 = BatchNormalization()(x2)
     x2 = PReLU()(x2)
-    pool2 = GlobalMaxPooling1D()(
-        x2)  # maxpool to get a 1d vector
-
-    junt = concatenate(inputs=[pool, pool2])
-
-    # dense
+    pool_fasta = GlobalMaxPooling1D()(x2)

-    de = Dense(units=1024, activation="relu")(junt)
-    dr = Dropout(0.3)(de)
-    de = Dense(units=1024, activation="relu")(dr)
-    dr = Dropout(0.3)(de)
-    de2 = Dense(units=512, activation="relu")(dr)
+    # Concatenate and Dense layers
+    combined = concatenate([pool_smiles, pool_fasta])
+    dense = Dense(units=1024, activation="relu")(combined)
+    dense = Dropout(0.3)(dense)
+    dense = Dense(units=1024, activation="relu")(dense)
+    dense = Dropout(0.3)(dense)
+    dense = Dense(units=512, activation="relu")(dense)
 
-    # output
-    output = Dense(
-        1, activation="relu", name="output", kernel_initializer="normal")(de2)
-
-    model = tf.keras.models.Model(
-        inputs=[smiles_input, fasta_input], outputs=[output])
+    # Output
+    output = Dense(1, activation="relu", name="output")(dense)
+
+    model = tf.keras.models.Model(inputs=[smiles_input, fasta_input], outputs=[output])


-    # function to check the accuracy of the model (this will be our metric)
     def r2_score(y_true, y_pred):
         SS_res = K.sum(K.square(y_true - y_pred))
         SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
-        return (1-SS_res/(SS_tot)+K.epsilon())
+        return (1 - SS_res / (SS_tot + K.epsilon()))
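The corrected return moves K.epsilon() inside the denominator, so R² stays finite when SS_tot is zero. A minimal numeric sketch in plain Python (illustrative values, stand-in for the Keras backend):

    y_true, y_pred = [2.0, 2.0], [1.0, 3.0]  # constant target: SS_tot is exactly 0
    ss_res = sum((t - p) ** 2 for t, p in zip(y_true, y_pred))          # 2.0
    mean_true = sum(y_true) / len(y_true)
    ss_tot = sum((t - mean_true) ** 2 for t in y_true)                  # 0.0
    eps = 1e-7
    # old form: 1 - ss_res / ss_tot + eps -> division by zero (NaN/inf in Keras)
    r2_new = 1 - ss_res / (ss_tot + eps)    # finite, because eps guards the denominator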

     # In case you want to continue training a model
     model.load_weights(r"")
 
     model.compile(optimizer="adam",
                   loss={'output': "mean_squared_logarithmic_error"},
                   metrics={'output': r2_score})
 
-    # To do checkpoints
     save_model_path = "models/cnn_model.hdf5"
-    checkpoint = ModelCheckpoint(save_model_path,
-                                 monitor='val_loss',
-                                 verbose=1,
-                                 save_best_only=True)
+    checkpoint = ModelCheckpoint(save_model_path, monitor='val_loss', verbose=1, save_best_only=True)

     # We use a high value to get better results
     size_per_epoch = 50700
 
-    train = arx[:355000]
+    train = file_path[:355000]
     loss = []
     loss_validades = []
     epochs = 50
 
-    for epoch in range(epochs):  # Amount of epochs you want to use
+    for epoch in range(epochs):
         start = 0
         end = size_per_epoch
-        print(f"Començant el epoch {epoch+1}")
+        print(f"Comenzando el epoch {epoch+1}")
 
-        while final < 355000:
+        while end <= 355000:
             X_smiles, X_fasta, y_train = convert(train[start:end])
 
-            r = model.fit({'smiles_input': np.array(X_smiles),
-                           'fasta_input': np.array(X_fasta)}, {'output': np.array(y_train)},
-                          validation_data=({'smiles_input': np.array(X_test_smile),
-                                            'fasta_input': np.array(X_test_fasta)}, {'output': np.array(T_test_IC50)}), callbacks=[checkpoint], epochs=20, batch_size=64, shuffle=True)
+            r = model.fit({'smiles_input': np.array(X_smiles),
+                           'fasta_input': np.array(X_fasta)},
+                          {'output': np.array(y_train)},
+                          validation_data=({'smiles_input': np.array(X_test_smile),
+                                            'fasta_input': np.array(X_test_fasta)},
+                                           {'output': np.array(T_test_IC50)}),
+                          callbacks=[checkpoint], epochs=1, batch_size=64, shuffle=True)
 
-            inici += size_per_epoch
-            final += size_per_epoch
+            start += size_per_epoch
+            end += size_per_epoch
 
-        loss.append(r.history["loss"])
-        loss_validades.append(r.history["val_loss"])
+        loss.append(np.mean(r.history["loss"]))
+        loss_validades.append(np.mean(r.history["val_loss"]))
 
     plt.plot(range(epochs), loss, label="loss")
    plt.plot(range(epochs), loss_validades, label="val_loss")
     plt.legend()
     plt.show()
 
 
-model_cnn()
+# Example usage
+model_cnn(file_path=file_path)
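The checkpoint above saves the best weights to models/cnn_model.hdf5. A hedged sketch of reloading them for inference, assuming the same convert preprocessing as during training (the slice of the DataFrame is a placeholder):

    import numpy as np
    import tensorflow as tf

    # compile=False: the custom r2_score metric is not needed for inference
    model = tf.keras.models.load_model("models/cnn_model.hdf5", compile=False)
    smiles_batch, fasta_batch, _ = convert(file_path[:32])  # hypothetical slice
    preds = model.predict({"smiles_input": np.array(smiles_batch),
                           "fasta_input": np.array(fasta_batch)})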
