From 4b207aa839b2e23e8d93470e6dbc4cd2c03eac8b Mon Sep 17 00:00:00 2001 From: RubenVG02 Date: Sat, 27 Jul 2024 00:55:49 +0200 Subject: [PATCH] Improved code clarity and readability --- src/cnn_affinity.py | 179 ++++++------- src/fix_data_for_models.py | 183 ++++++-------- src/generate_rnn.py | 26 +- src/genetic_algorithm.py | 501 +++++++++++++++++-------------------- src/get_coordinates.py | 25 +- src/pretrained_rnn.py | 56 +++-- src/pymol_3d.py | 51 ++-- 7 files changed, 494 insertions(+), 527 deletions(-) diff --git a/src/cnn_affinity.py b/src/cnn_affinity.py index 5c4b038..0741e25 100644 --- a/src/cnn_affinity.py +++ b/src/cnn_affinity.py @@ -25,152 +25,134 @@ int_smiles = dict(zip(elements_smiles, range(1, len(elements_smiles)+1))) int_fasta = dict(zip(elements_fasta, range(1, len(elements_fasta)+1))) -def convert(arx=file_path): - - #Function to convert all elements (both smiles and fasta) into int, in order to be trained in the model - - smiles_w_numbers = [] # Smiles obtained with int_smiles[1] and the smiles of the df - for i in arx.smiles: - smiles_list = [] - for elements in i: # Elements refers to the elements that make up elements_smile - try: - smiles_list.append(int_smiles[elements]) - except: - pass - while (len(smiles_list) != max_smiles): - smiles_list.append(0) +def convert(file_path=file_path): + + ''' + Function to convert all elements (both smiles and fasta) into int, in order to be trained in the model + + Parameters: + + file_path (path): DataFrame containing the SMILES, FASTA and IC50 columns. Columns must be named "smiles", "sequence" and "IC50". This file is generated from src/fix_data_for_models.py + + Returns: + + smiles_w_numbers (list): List of SMILES converted to integers + fasta_w_numbers (list): List of FASTA converted to integers + + ''' + + smiles_w_numbers = [] + for i in file_path.smiles: + smiles_list = [int_smiles.get(element, 0) for element in i] + smiles_list.extend([0] * (max_smiles - len(smiles_list))) smiles_w_numbers.append(smiles_list) fasta_w_numbers = [] - for i in arx.sequence: - fasta_list = [] - for elements in i: # Elements fa referència a els elements que formen elements_smile - try: - fasta_list.append(int_fasta[elements]) - except: - pass - while (len(fasta_list) != max_fasta): - fasta_list.append(0) + for i in file_path.sequence: + fasta_list = [int_fasta.get(element, 0) for element in i] + fasta_list.extend([0] * (max_fasta - len(fasta_list))) fasta_w_numbers.append(fasta_list) - ic50_numeros = list(arx.IC50) + ic50_numeros = list(file_path.IC50) return smiles_w_numbers, fasta_w_numbers, ic50_numeros -X_test_smile, X_test_fasta, T_test_IC50 = convert(arx[350000:]) +X_test_smile, X_test_fasta, T_test_IC50 = convert(file_path[350000:]) -def model_cnn(): - # model to train - - # kernel regularizer - regulatos = l2(0.001) - - # model per a smiles - smiles_input = Input( - shape=(max_smiles,), dtype='int32', name='smiles_input') - embed = Embedding(input_dim=len( - elements_smiles)+1, input_length=max_smiles, output_dim=128)(smiles_input) - x = Conv1D( - filters=32, kernel_size=3, padding="SAME", input_shape=(50700, max_smiles))(embed) + +def model_cnn(file_path=file_path): + + ''' + Function to train a model using CNN. The model is trained using the SMILES and FASTA sequences. + The model is trained using the IC50 values. + + Parameters: + file_path (path): DataFrame containing the SMILES, FASTA and IC50 columns. Columns must be named "smiles", "sequence" and "IC50". This file is generated from src/fix_data_for_models.py + + ''' + regulator = l2(0.001) + + # Model for SMILES + smiles_input = Input(shape=(max_smiles,), dtype='int32', name='smiles_input') + embed_smiles = Embedding(input_dim=len(elements_smiles)+1, input_length=max_smiles, output_dim=128)(smiles_input) + x = Conv1D(filters=32, kernel_size=3, padding="SAME", kernel_regularizer=regulator)(embed_smiles) x = PReLU()(x) x = Conv1D(filters=64, kernel_size=3, padding="SAME")(x) x = BatchNormalization()(x) x = PReLU()(x) - x = Conv1D( - filters=128, kernel_size=3, padding="SAME")(x) + x = Conv1D(filters=128, kernel_size=3, padding="SAME")(x) x = BatchNormalization()(x) x = PReLU()(x) - pool = GlobalMaxPooling1D()( - x) # maxpool to get a 1d vector + pool_smiles = GlobalMaxPooling1D()(x) - # model per fastas + # Model for FASTA fasta_input = Input(shape=(max_fasta,), name='fasta_input') - embed2 = Embedding(input_dim=len( - elements_fasta)+1, input_length=max_fasta, output_dim=256)(fasta_input) - x2 = Conv1D( - filters=32, kernel_size=3, padding="SAME", input_shape=(50700, max_fasta))(embed2) - x2 = PReLU()(embed2) - - x2 = Conv1D( - filters=64, kernel_size=3, padding="SAME")(x2) + embed_fasta = Embedding(input_dim=len(elements_fasta)+1, input_length=max_fasta, output_dim=256)(fasta_input) + x2 = Conv1D(filters=32, kernel_size=3, padding="SAME")(embed_fasta) + x2 = PReLU()(x2) + + x2 = Conv1D(filters=64, kernel_size=3, padding="SAME")(x2) x2 = BatchNormalization()(x2) x2 = PReLU()(x2) - x2 = Conv1D( - filters=128, kernel_size=3, padding="SAME")(x2) + x2 = Conv1D(filters=128, kernel_size=3, padding="SAME")(x2) x2 = BatchNormalization()(x2) x2 = PReLU()(x2) - pool2 = GlobalMaxPooling1D()( - x2) #maxpool to get a 1d vector - - junt = concatenate(inputs=[pool, pool2]) - - # dense + pool_fasta = GlobalMaxPooling1D()(x2) - de = Dense(units=1024, activation="relu")(junt) - dr = Dropout(0.3)(de) - de = Dense(units=1024, activation="relu")(dr) - dr = Dropout(0.3)(de) - de2 = Dense(units=512, activation="relu")(dr) + # Concatenate and Dense layers + combined = concatenate([pool_smiles, pool_fasta]) + dense = Dense(units=1024, activation="relu")(combined) + dense = Dropout(0.3)(dense) + dense = Dense(units=1024, activation="relu")(dense) + dense = Dropout(0.3)(dense) + dense = Dense(units=512, activation="relu")(dense) - # output + output = Dense(1, activation="relu", name="output")(dense) - output = Dense( - 1, activation="relu", name="output", kernel_initializer="normal")(de2) + model = tf.keras.models.Model(inputs=[smiles_input, fasta_input], outputs=[output]) - model = tf.keras.models.Model( - inputs=[smiles_input, fasta_input], outputs=[output]) - - - # funció per mirar la precisió del model (serà la nostra metric) def r2_score(y_true, y_pred): SS_res = K.sum(K.square(y_true - y_pred)) SS_tot = K.sum(K.square(y_true - K.mean(y_true))) - return (1-SS_res/(SS_tot)+K.epsilon()) + return (1 - SS_res / (SS_tot + K.epsilon())) - model.load_weights( - r"") - # In case you want to continue training a model - model.compile(optimizer="adam", - loss={'output': "mean_squared_logarithmic_error"}, - metrics={'output': r2_score}) - - # To do checkpoints + loss={'output': "mean_squared_logarithmic_error"}, + metrics={'output': r2_score}) + save_model_path = "models/cnn_model.hdf5" - checkpoint = ModelCheckpoint(save_model_path, - monitor='val_loss', - verbose=1, - save_best_only=True) + checkpoint = ModelCheckpoint(save_model_path, monitor='val_loss', verbose=1, save_best_only=True) - # We use a high value to get better results size_per_epoch = 50700 - - train = arx[:355000] + train = file_path[:355000] loss = [] loss_validades = [] epochs = 50 - for epoch in range(epochs): #Amount of epochs you want to use + for epoch in range(epochs): start = 0 end = size_per_epoch - print(f"Començant el epoch {epoch+1}") + print(f"Comenzando el epoch {epoch+1}") - while final < 355000: + while end <= 355000: X_smiles, X_fasta, y_train = convert(train[start:end]) r = model.fit({'smiles_input': np.array(X_smiles), - 'fasta_input': np.array(X_fasta)}, {'output': np.array(y_train)}, + 'fasta_input': np.array(X_fasta)}, + {'output': np.array(y_train)}, validation_data=({'smiles_input': np.array(X_test_smile), - 'fasta_input': np.array(X_test_fasta)}, {'output': np.array(T_test_IC50)}), callbacks=[checkpoint], epochs=20, batch_size=64, shuffle=True) + 'fasta_input': np.array(X_test_fasta)}, + {'output': np.array(T_test_IC50)}), + callbacks=[checkpoint], epochs=1, batch_size=64, shuffle=True) - inici += size_per_epoch - final += size_per_epoch + start += size_per_epoch + end += size_per_epoch - loss.append(r.history["loss"]) - loss_validades.append(r.history["val_loss"]) + loss.append(np.mean(r.history["loss"])) + loss_validades.append(np.mean(r.history["val_loss"])) plt.plot(range(epochs), loss, label="loss") plt.plot(range(epochs), loss_validades, label="val_loss") @@ -178,4 +160,5 @@ def r2_score(y_true, y_pred): plt.show() -model_cnn() +# Example usage +model_cnn(file_path=file_path) \ No newline at end of file diff --git a/src/fix_data_for_models.py b/src/fix_data_for_models.py index da2fb6d..bd4b660 100644 --- a/src/fix_data_for_models.py +++ b/src/fix_data_for_models.py @@ -2,120 +2,93 @@ from chembl_webresource_client.new_client import new_client import csv +def get_target_smiles(target_id, output_csv="data1"): + """ + Retrieve all SMILES for a given target and store the data in a CSV file. -def selection_mol(id, name_file="data1", name_df="df"): - ''' - Function to obtain all the smiles of a given target, and store them into a csv file - Parameters: - -id: CHEMBL ID - -name_file: Name of the file (by default, "data1") - -name_df: Name df (by default, "df") - ''' - act = new_client.activity - # IC50: amount of inhibitor subs you need to inhibit at 50% - res = act.filter(target_chembl_id=id).filter(standard_type="IC50") - name_df = pd.DataFrame.from_dict(res) - name_df.to_csv(f"{name_file}.csv", index_label=False) - return name_df - - -def clean_data_rnn(name_file="drugs", name_file2="dades_netes"): - ''' - Function to clean the data and store it into a csv file, which will be ready to be used for the RNN model + target_id (str): The CHEMBL ID of the target. + output_csv (str): Name of the output CSV file (without extension). - Parameters: - -name_file: input file name - -name_file2: name of the net archive - ''' - arx = pd.read_csv(f"{name_file}.csv", sep=";", index_col=False) - datfr = arx[arx["Standard Value"].notna()] - datfr = datfr[arx.Smiles.notna()] - df_no_dup = datfr.drop_duplicates(['Smiles']) - selecc = ['Molecule ChEMBL ID', 'Smiles', 'Standard Value'] - df2 = df_no_dup[selecc] - print(df2.value_counts()) - - # Standard Value smaller than 0 are deleted - df2 = df2[df2["Standard Value"] > 0] - df2 = df2.reset_index(drop=True) - # This is the - df2.to_csv(f"{name_file2}.csv", index=False, sep=",") + Returns: + pd.DataFrame: DataFrame containing the retrieved data. + """ + activity_client = new_client.activity + results = activity_client.filter(target_chembl_id=target_id).filter(standard_type="IC50") + df = pd.DataFrame.from_dict(results) + df.to_csv(f"{output_csv}.csv", index=False) + return df - # Quants elements utilitzes per entrenar el model - print(df2.value_counts()) - return f"{name_file2}.csv" +def clean_and_prepare_data(input_csv="drugs", output_csv="cleaned_data"): + """ + Clean the data and prepare it for the RNN model, saving the result to a CSV file. -def obtain_smiles(origin_file="500k_dades", destination_txt="smiles_22", separator=","): - ''' - Parameters: - -origin_file: file from which we will obtain the smiles - -destination_file: file where the smiles will be saved - -separator: separator of the file (by default, ",") - ''' - dades = pd.read_csv(f"{origin_file}.csv", sep=separator,low_memory=False) - llista_smiles = dades["Smiles"].unique() - with open(f"{destination_txt}.txt", "w") as f: - for line in llista_smiles: - f.write(str(line) + "\n") - - -def neteja_dades_afinitat(name_file="inh", destination_file="cnn_file_fixed", col_smiles="Ligand SMILES", col_ic50="IC50 (nM)", col_seq="BindingDB Target Chain Sequence"): - ''' - Parameters: - -name_file: Name of the source file to modify, in tsv format - -target_name: Name of the target file to be created - -col_smiles: Columns from which you want to get the smiles - -col_ic50: Columns from which to obtain the ic50 - -col_seq: Columns from which to obtain the sequences - - ''' - with open(f"{name_file}.tsv", "r", encoding="utf8") as file: - df = pd.read_csv(file, sep="\t", on_bad_lines="skip", - low_memory=False, nrows=1000000) - columna_cadena = df[col_seq] - columna_50 = df[col_ic50] - columna_smiles = df[col_smiles] + Parameters: + input_csv (str): Name of the input CSV file (without extension). + output_csv (str): Name of the output CSV file (without extension). + + Returns: + str: Name of the output CSV file. + """ + df = pd.read_csv(f"{input_csv}.csv", sep=";", index_col=False) + df_clean = df.dropna(subset=["Standard Value", "Smiles"]) + df_clean = df_clean.drop_duplicates(subset=["Smiles"]) + df_clean = df_clean[df_clean["Standard Value"] > 0] + df_clean = df_clean.reset_index(drop=True) + + columns_to_keep = ['Molecule ChEMBL ID', 'Smiles', 'Standard Value'] + df_final = df_clean[columns_to_keep] + df_final.to_csv(f"{output_csv}.csv", index=False, sep=",") + + print(f"Cleaned data: {df_final.shape[0]} records.") + return f"{output_csv}.csv" - cadena = [] - for i in columna_cadena: - cadena.append(i) - ic50 = [] - for i in columna_50: - ic50.append(i) - smiles = [] - for i in columna_smiles: - smiles.append(i) - headers = ["Smiles", "IC50", "sequence"] - listas = [smiles, ic50, cadena] - with open(f"{destination_file}.csv", "w", encoding="utf8") as archivo: - write = csv.writer(archivo) - write.writerow(headers) - write.writerows(zip(*listas)) - arx = pd.read_csv(f"{destination_file}.csv", sep=",") - arx["IC50"] = arx["IC50"].str.strip() - arx = arx.dropna(how="any").reset_index(drop=True) - # print(datfr) - df_no_dup = arx.drop_duplicates(['smiles']) - df_no_dup["IC50"] = df_no_dup["IC50"].str.replace(r"[<>]", "", regex=True) - df_no_dup["IC50"] = df_no_dup["IC50"].astype(float) - df_no_dup = df_no_dup[df_no_dup["IC50"] < 1000000] - df_no_dup = df_no_dup[df_no_dup["Smiles"].str.len() < 100] - df_no_dup = df_no_dup[df_no_dup["sequence"].str.len() < 5000] - #smiles_without_symbol = [] - ''' for i in df_no_dup["smiles"]: - smiles_sense_simbols.append(i.replace("@", "").replace("\\", - "").replace("/", "").replace(".", ""))''' +def extract_smiles_from_csv(input_csv="500k_dades", output_txt="smiles_list", separator=","): + """ + Extract SMILES from a CSV file and save them to a text file. - '''df_no_dup["smiles"] = smiles_sense_simbols - del smiles_sense_simbols''' - df_no_dup["sequence"] = df_no_dup["sequence"].apply( - lambda x: x.upper()) - df_no_dup = df_no_dup.sample(frac=1).reset_index(drop=True) - df_no_dup.to_csv(f"{destination_file}.csv", index=False, sep=",") - return f"{destination_file}.csv" + Parameters: + input_csv (str): Name of the input CSV file (without extension). + output_txt (str): Name of the output text file (without extension). + separator (str): Separator used in the CSV file. + """ + df = pd.read_csv(f"{input_csv}.csv", sep=separator, low_memory=False) + unique_smiles = df["Smiles"].unique() + with open(f"{output_txt}.txt", "w") as file: + for smile in unique_smiles: + file.write(f"{smile}\n") + +def clean_affinity_data(input_tsv="inh", output_csv="cleaned_affinity_data", col_smiles="Ligand SMILES", col_ic50="IC50 (nM)", col_seq="BindingDB Target Chain Sequence"): + """ + Clean affinity data and save it to a CSV file. + Parameters: + input_tsv (str): Name of the input TSV file (without extension). + output_csv (str): Name of the output CSV file (without extension). + col_smiles (str): Name of the column containing SMILES. + col_ic50 (str): Name of the column containing IC50 values. + col_seq (str): Name of the column containing sequences. + + Returns: + str: Name of the output CSV file. + """ + df = pd.read_csv(f"{input_tsv}.tsv", sep="\t", on_bad_lines="skip", low_memory=False) + + df_clean = df[[col_smiles, col_ic50, col_seq]].dropna() + df_clean.columns = ["Smiles", "IC50", "sequence"] + + df_clean["IC50"] = df_clean["IC50"].str.strip().str.replace(r"[<>]", "", regex=True).astype(float) + df_clean = df_clean[(df_clean["IC50"] > 0) & (df_clean["IC50"] < 1000000)] + df_clean = df_clean[df_clean["Smiles"].str.len() < 100] + df_clean = df_clean[df_clean["sequence"].str.len() < 5000] + df_clean["sequence"] = df_clean["sequence"].str.upper() + + df_clean = df_clean.drop_duplicates(subset=["Smiles"]) + df_clean = df_clean.sample(frac=1).reset_index(drop=True) + + df_clean.to_csv(f"{output_csv}.csv", index=False, sep=",") + return f"{output_csv}.csv" diff --git a/src/generate_rnn.py b/src/generate_rnn.py index e60829b..2604463 100644 --- a/src/generate_rnn.py +++ b/src/generate_rnn.py @@ -12,12 +12,11 @@ data = open(r"samples\txt_files\98k.txt").read() -# to get the unique data elements to integers using a dictionary -# so we associate a numerical value to each letter +# To get the unique data elements to integers using a dictionary elements_smiles = {u: i for i, u in enumerate(sorted(set(data)))} elements_smiles.update({-1: "\n"}) -# to pass the numeric elements to smile elements +# To pass the numeric elements to smile elements int_2_elements = {i: u for i, u in enumerate(sorted(set(data)))} int_2_elements.update({"\n": -1}) @@ -62,20 +61,20 @@ def split_input_target(chunk, values=map_char): Dropout(0.15), Dense(map_char, activation="softmax")]) -#We can modify the model to add more layers or change the number of neurons in each layer -#We can also change the optimizer, the loss function and the metrics -#Depending on the number of different elements in your smile sequence, map_char can be changed, and you can also change it manually depending on your df +# We can modify the model to add more layers or change the number of neurons in each layer +# We can also change the optimizer, the loss function and the metrics +# Depending on the number of different elements in your smile sequence, map_char can be changed, and you can also change it manually depending on your df model.load_weights( r"models\definitive_models\rnn_model.hdf5") -#This is used to continue training a model that has already been trained +# This is used to continue training a model that has already been trained model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]) -#Different loss functions can be used, but I reccomend categorical_crossentropy +# Different loss functions can be used, but I reccomend categorical_crossentropy -filepath = "" #Path to save the model -checkpoint = ModelCheckpoint(filepath=filepath, +checkpoint_path = "path_to_save_model" +checkpoint = ModelCheckpoint(filepath=checkpoint_path, monitor='loss', verbose=1, save_best_only=True, mode='min') @@ -83,7 +82,10 @@ def split_input_target(chunk, values=map_char): r = model.fit(dataset, epochs=100, callbacks=callbacks_list, batch_size=128) -plt.plot(r.history["accuracy"], label="accuracy") +plt.plot(r.r["accuracy"], label="accuracy") +plt.title("Model Accuracy") +plt.xlabel("Epochs") +plt.ylabel("Accuracy") plt.legend() +plt.savefig("accuracy_plot.png") plt.show() -plt.savefig("acc.png") diff --git a/src/genetic_algorithm.py b/src/genetic_algorithm.py index c7e3e8d..3899998 100644 --- a/src/genetic_algorithm.py +++ b/src/genetic_algorithm.py @@ -1,84 +1,89 @@ -from rdkit import Chem -from rdkit.Chem import Draw -from rdkit.Chem.rdchem import RWMol -from rdkit.Chem import Draw -from rdkit.Chem import Descriptors - - -import random +from rdkit import Chem +from rdkit.Chem import Draw, Descriptors, RWMol +import random import csv +import numpy as np +import pandas as pd +import time from affinity_with_target_and_generator import find_candidates from check_affinity import calculate_affinity - def select_parents(initial_population, target, bests=2): - - ''' - Function to select the parents of the first generations of the genetic algorithm. The parents are the molecules with the best affinity to the target. + """ + Select the parents for the first generations of the genetic algorithm. + The parents are the molecules with the best affinity to the target. Parameters: - -initial_population: Path of the file with the initial population of molecules. If it is not specified, the function will use the function find_candidates to obtain the initial population. - -target: Sequence of the target in FASTA format. - -bests: Number of best molecules to select as parents. By default, 2. + initial_population (str or list): Path of the file with the initial population of molecules + or a list of SMILES strings. + target (str): Sequence of the target in FASTA format. + bests (int, optional): Number of best molecules to select as parents. Defaults to 2. - ''' + Returns: + list of tuples: List of tuples containing the best molecules and their affinities. + """ + + def load_population(file_path): + + """ + Load a list of SMILES strings from a CSV file. + + Parameters: + file_path (str): Path to the CSV file. + + Returns: + list of str: List of SMILES strings. + """ + + with open(file_path, "r") as file: + reader = csv.reader(file) + return [row[0] for row in reader][1:] + + def score_population(population): + + """ + Score a population of SMILES strings based on their affinity to the target. + + Parameters: + population (list of str): List of SMILES strings. + + Returns: + list of tuples: List of tuples containing SMILES strings and their affinities, sorted by affinity. + """ + + scores = [calculate_affinity(smile=i, fasta=target) for i in population] + return sorted(zip(population, scores), key=lambda x: x[1]) + if "generate" in initial_population: - find_candidates(max_molecules=5,db_smiles=True,target=target,draw_minor=False,generate_qr=False,upload_to_mega=False, arx_db=r"generated_molecules/generated_molecules.txt", accepted_value=3000, name_file_destination="generated_w_algo") #Already created txt file of smiles, so db_smiles=True. If not, db_smiles=False - with open("generated_w_algo.csv", "r") as file: - reader = csv.reader(file) - initial_population = [row[0] for row in reader][1:] - score=[] - for i in initial_population: - i=i.replace("@", "").replace("/", "") - value=calculate_affinity(smile=i, fasta=target) - score.append(value) - total=zip(initial_population, score) - total=sorted(total, key=lambda x: x[1]) - if ".csv" in initial_population: - with open(initial_population, "r") as file: - reader = csv.reader(file) - initial_population = [row[0] for row in reader][1:] - score=[] - for i in initial_population: - i=i.replace("@", "").replace("/", "") - value=calculate_affinity(smile=i, fasta=target) - score.append(value) - total=zip(initial_population, score) - total=sorted(total, key=lambda x: x[1]) - if ".txt" in initial_population: - with open(initial_population, "r") as file: - score=[] - for row in file: - row=row.replace("@", "").replace("/", "") - value=calculate_affinity(smile=row, fasta=target) - score.append(value) - total=zip(initial_population, score) - total=sorted(total, key=lambda x: x[1]) - else: #if the initial population is a list - score=[] - for i in initial_population: - i=i.replace("@", "").replace("/", "") - value=calculate_affinity(smile=i, fasta=target) - score.append(value) - total=zip(initial_population, score) - total=sorted(total, key=lambda x: x[1]) - try: - parents=total[:bests] - except: - parents=total[:len(total)] - return parents - + find_candidates(max_molecules=5, db_smiles=True, target=target, draw_minor=False, generate_qr=False, + upload_to_mega=False, arx_db=r"generated_molecules/generated_molecules.txt", + accepted_value=3000, name_file_destination="generated_w_algo") + initial_population = load_population("generated_w_algo.csv") + + elif ".csv" in initial_population or ".txt" in initial_population: + initial_population = load_population(initial_population) + + else: + initial_population = [i.replace("@", "").replace("/", "") for i in initial_population] + + total = score_population(initial_population) + return total[:bests] if total else [] + def check_druglikeness(smile): - ''' - Function to check if a molecule is druglike. If not, it will be removed from the population. + + """ + Check if a molecule is druglike based on its SMILES string. Parameters: - - smile: Molecule structure in SMILE format. + smile (str): Molecule structure in SMILES format. + + Returns: + bool: True if the molecule is druglike, False otherwise. + """ - ''' mol = Chem.MolFromSmiles(smile) if mol: return (Descriptors.ExactMolWt(mol) < 500 and @@ -86,245 +91,209 @@ def check_druglikeness(smile): Descriptors.NumHDonors(mol) < 5 and Descriptors.NumHAcceptors(mol) < 10) return False - -def childs(*parents, cant=0): - ''' - Function to cross two smile sequences in order to obtain two new molecules. Function used when ic50 value does not improve during the generations. - + + +def generate_children(parent1, parent2, mutation_attempts=10): + + """ + Generate two new molecules by crossing two parent SMILES sequences. + Parameters: - -parents: Sequences of the molecules in smile format obtained from the parents generation. - -cant: Number of times the function has been called without forming childs. If it is called 10 times, it will return the parents without creating new molecules. - ''' - parents1=[] - for i in parents: - parents1.append(i) - if len(parents1[1])>len(parents1[0]): - crossover_point=random.randint(0, len(parents[0])-1) - else: - crossover_point=random.randint(2, len(parents[1])-1) - child1 = parents[0][:crossover_point] + parents[1][crossover_point:] - child2 = parents[1][:crossover_point] + parents[0][crossover_point:] - print(child1, child2) - - if Chem.MolFromSmiles(child1) is not None and Chem.MolFromSmiles(child2) is not None: + parent1 (str): SMILES sequence of the first parent. + parent2 (str): SMILES sequence of the second parent. + mutation_attempts (int): Number of attempts to generate valid children. Defaults to 10. + + Returns: + tuple of str: Tuple of two child SMILES sequences or the original parents if valid children cannot be created. + """ + + def crossover(smile1, smile2): + crossover_point = random.randint(1, min(len(smile1), len(smile2)) - 1) + child1 = smile1[:crossover_point] + smile2[crossover_point:] + child2 = smile2[:crossover_point] + smile1[crossover_point:] return child1, child2 - else: - if cant==10: - print("No childs were created") - return parents - childs(*parents) - cant+=1 - -def mutations(smile, mutation_rate=0.1): - ''' - Function to mutate a molecule in order to obtain a new molecule with a better affinity to the target. + child1, child2 = crossover(parent1, parent2) + if Chem.MolFromSmiles(child1) and Chem.MolFromSmiles(child2): + return child1, child2 - Parameters: - -smile: SMILE string of the molecule obtained from parents - -mutation_rate: Probability of mutation for each atom in the molecule. By default, 0.1 + if mutation_attempts > 0: + return generate_children(parent1, parent2, mutation_attempts - 1) + + print("Failed to generate valid children; returning original parents.") + return parent1, parent2 - ''' - atoms= [6, 5, 7, 15, 8, 16, 9, 17, 35, 53] +def mutate(smile, mutation_rate=0.1): - aromatic_atoms=[6, 7, 15, 8, 16] + """ + Mutate a molecule to obtain new molecule structures. - len_smile=Chem.MolFromSmiles(smile).GetNumAtoms() + Parameters: + smile (str): SMILES string of the molecule. + mutation_rate (float): Probability of mutation for each atom in the molecule. Defaults to 0.1. - copy=smile #We copy the molecule in order to not modify the original molecule + Returns: + list of str: List of mutated SMILES strings. + """ + atoms = [6, 5, 7, 15, 8, 16, 9, 17, 35, 53] + aromatic_atoms = [6, 7, 15, 8, 16] - mol1=Chem.MolFromSmiles(copy) - Chem.SanitizeMol(mol1) - Chem.Kekulize(mol1) - mol1=RWMol(mol1) + mol = Chem.MolFromSmiles(smile) + if not mol: + return [] - error=0 #if the function does not generate valid molecules 10 times, it will select new parents - - child_generated=[] - while len(child_generated)<5 and error<4000: - for molecule in copy: - if random.uniform(0,1) <= mutation_rate: - atom_to_remove=np.random.randint(0, len_smile) #random atom to remove from the molecule - try: - valence_atom=mol1.GetAtomWithIdx(atom_to_remove).GetTotalValence() #valence of the atom to remove - print(valence_atom) - error+=1 - except: - pass - if mol1.GetAtomWithIdx(atom_to_remove).GetIsAromatic(): - if valence_atom==2: - mol1.ReplaceAtom(atom_to_remove, Chem.Atom(aromatic_atoms[np.random.randint(1, 5)])) - elif valence_atom==3: - mol1.ReplaceAtom(atom_to_remove, Chem.Atom(aromatic_atoms[np.random.randint(0, 3)])) + mutated_smiles = set() + for _ in range(5): + try: + mol = Chem.MolFromSmiles(smile) + rw_mol = RWMol(mol) + num_atoms = rw_mol.GetNumAtoms() + for atom_idx in range(num_atoms): + if random.uniform(0, 1) <= mutation_rate: + atom = rw_mol.GetAtomWithIdx(atom_idx) + valence = atom.GetTotalValence() + if atom.GetIsAromatic(): + if valence == 2: + rw_mol.ReplaceAtom(atom_idx, Chem.Atom(random.choice(aromatic_atoms[1:]))) + elif valence == 3: + rw_mol.ReplaceAtom(atom_idx, Chem.Atom(random.choice(aromatic_atoms[:3]))) else: - continue - mol1.GetAtomWithIdx(atom_to_remove).SetIsAromatic(True) - else: - if valence_atom==1: - mol1.ReplaceAtom(atom_to_remove, Chem.Atom(atoms[np.random.randint(0, 10)])) - elif valence_atom==2: - mol1.ReplaceAtom(atom_to_remove, Chem.Atom(atoms[np.random.randint(0, 6)])) - elif valence_atom==3: - mol1.ReplaceAtom(atom_to_remove, Chem.Atom(atoms[np.random.randint(0, 4)])) - elif valence_atom==4: - mol1.ReplaceAtom(atom_to_remove, Chem.Atom(atoms[np.random.randint(0, 1)])) - try: - Chem.SanitizeMol(mol1) - Chem.Kekulize(mol1) - except Chem.rdchem.KekulizeException: - pass - if Chem.MolToSmiles(mol1): - if Chem.MolToSmiles(mol1) not in child_generated and check_druglikeness(Chem.MolToSmiles(mol1)): - child_generated.append(Chem.MolToSmiles(mol1)) - else: - break - else: - break + rw_mol.ReplaceAtom(atom_idx, Chem.Atom(random.choice(atoms[:4]))) + Chem.SanitizeMol(rw_mol) + mutated_smile = Chem.MolToSmiles(rw_mol) + if check_druglikeness(mutated_smile): + mutated_smiles.add(mutated_smile) + except Exception: + pass + + return list(mutated_smiles) + - return child_generated - -def file_preparation(file_path, headers=[]): +def prepare_file(file_path, headers=[]): ''' - Function to create the .csv file to which the obtained data will be added - + Create a CSV file with specified headers. + Parameters: - -headers: Names of the columns we want to use + file_path (str): Path to the file. + headers (list of str): List of column names. ''' with open(f"{file_path}.csv", "w", newline="") as file: writer = csv.writer(file) - writer.writerow(headers) + writer.writerow(headers) -def genetic_algorithm(target="", initial_pop_path=r"", objective_ic50=20, generations=100, bests=2, path_save=r"", save_since=20, name_file_best=r"", name_img="result", initial=r"", name_img_initial=""): +def genetic_algorithm(target="", initial_pop_path="", objective_ic50=20, generations=100, bests=2, + path_save="", save_since=20, name_file_best="", name_img="result", initial="", name_img_initial=""): ''' - Function to find the best molecule to bind to a target protein using a genetic algorithm. + Find the best molecule to bind to a target protein using a genetic algorithm. Parameters: - -target: Sequence of the target protein in fasta format. - -initial_pop_path: Path of the initial population of smile molecules. If this file does not exist, the function will create it. - -objective_ic50: Value of the affinity of the target protein that we want to obtain. By default, it is 20. - -generations: Number of generations of the genetic algorithm. By default, it is 100. - -bests: Number of best molecules that we want to select from each generation. By default, it is 5. - -path_save: Path where we want to save the best molecule obtained in each generation. By default, it is the current directory. - -save_since: Since which ic50 value we want to save the best molecule obtained in each generation. By default, it is 20. - -name_file: Name of the file where we want to save the best molecule obtained depending on best and save_since. By default, it is "resultados.csv". - -name_img: Name of the best molecule obtained in the last generation to be represented. By default, it is "result". - -initial: Name of the file where we want to save the best molecule obtained in the first generation to compare at the end with the generated ones. By default, it is "initial". + target (str): Sequence of the target protein in FASTA format. + initial_pop_path (str): Path of the initial population of SMILES molecules. + objective_ic50 (float): Desired IC50 value. Defaults to 20. + generations (int): Number of generations. Defaults to 100. + bests (int): Number of best molecules to select. Defaults to 2. + path_save (str): Path to save the best molecules. + save_since (float): IC50 value threshold to save molecules. Defaults to 20. + name_file_best (str): File name to save the best molecules. + name_img (str): Image file name for the best molecule. + initial (str): File name for the best molecule from the initial generation. + name_img_initial (str): Image file name for the initial best molecule. ''' - parents=select_parents(initial_population=initial_pop_path, target=target, bests=bests) #We select the best molecules from the initial population + parents = select_parents(initial_population=initial_pop_path, target=target, bests=bests) + + best_parent = min(parents, key=lambda x: x[1]) + prepare_file(file_path=path_save, headers=["SMILE", "Affinity"]) + with open(f"{initial}.csv", "a", newline="") as file: writer = csv.writer(file) - best_parent= min(parents, key=lambda x: x[1]) - writer.writerow([best_parent[0], best_parent[1]]) #We save the best parent to compare at the end - Draw.MolToImageFile(Chem.MolFromSmiles(best_parent[0]), filename=fr"examples/best_molecule_{name_img_initial}.jpg", - size=(400, 300)) - file_preparation(file_path=path_save, headers=["SMILE", "Affinity"]) - - all_bests=[] #We create a list to save the best molecules obtained in each generation in order to compare if we need to use the childs function - sum_not_improve=0 #We create a variable to count the number of generations in which the best molecule has not improved - smiles_saved=[] #list to save the best 2 molecules in case we need to use the childs function - best_generated=tuple() #We create a tuple to save the best molecule obtained overall + writer.writerow([best_parent[0], best_parent[1]]) + Draw.MolToImageFile(Chem.MolFromSmiles(best_parent[0]), filename=f"examples/best_molecule_{name_img_initial}.jpg", size=(400, 300)) + + all_bests = [] + sum_not_improve = 0 + smiles_saved = [] + best_generated = (best_parent[0], best_parent[1]) + for gen in range(generations): - new_generation=[] - total=[] - if sum_not_improve >= 6: #If the best molecule has not improved for x generations, we use the childs function - print("Using childs function") - parents= childs(smiles_saved[0], smiles_saved[1]) - if parents!=None: - print("\n\n\n") - print("The best molecules have been recombined, the new parents are: ", parents) - print("\n\n\n") - else: - print("The best molecules can't be recombined, the same parents will be used: ", parents) - time.sleep(4) - sum_not_improve=0 + new_generation = [] + + if sum_not_improve >= 6: + print("Using child generation due to stagnation.") + parents = generate_children(smiles_saved[0], smiles_saved[1]) + sum_not_improve = 0 else: - parents=[i[0] for i in parents] - for i in parents: - mutation=mutations(smile=i, mutation_rate=0.1) - new_generation.extend(mutation) + parents = [p[0] for p in parents] + + for parent in parents: + new_generation.extend(mutate(smile=parent, mutation_rate=0.1)) - score=[] - print(new_generation) + score = [calculate_affinity(smile=smile, fasta=target) for smile in new_generation] + total = sorted(zip(new_generation, score), key=lambda x: x[1]) - for smile in new_generation: - smile=str(smile).replace("@", "").replace("\\","").replace("/", "").replace(".", "") - value=calculate_affinity(smile=smile, fasta=target) - score.append(value) - total=zip(new_generation, score) - total=sorted(total, key=lambda x: x[1]) with open(f"{path_save}.csv", "a") as file: - for i in total: - if i[1] <= save_since: - if i[0] not in pd.read_csv(f"{path_save}.csv").SMILE.tolist(): - file.write(f"{i[0]}, {i[1]}\n") - if compare_ic50(list_score=total, objective_ic50=objective_ic50) is not False or gen==generations-1: - best_individual, affinity= compare_ic50(list_score=total, objective_ic50=objective_ic50) - print("\n\n\n") - print("--------") - print("Generation:", gen+1) - if gen==generations-1: - print("Best SMILE sequence obtained:", best_generated[0]) #best_individual - print("IC50 value:", best_generated[1]) #affinity - else: - print("Best SMILE sequence obtained:", best_individual) - print("IC50 value:", affinity) - print("--------") - molecule = Chem.MolFromSmiles(best_individual) - Draw.MolToImageFile(molecule, filename=fr"examples/best_molecule_{name_img}.jpg", - size=(400, 300)) - with open(f"{name_file_best}.csv","a") as file: - file_preparation(file_path=f"{name_file_best}", headers=["SMILE", "Affinity"]) - file.write(f"{best_individual}, {affinity}\n") + for item in total: + if item[1] <= save_since: + if item[0] not in pd.read_csv(f"{path_save}.csv").SMILE.tolist(): + file.write(f"{item[0]}, {item[1]}\n") + + best = compare_ic50(list_score=total, objective_ic50=objective_ic50) + if best or gen == generations - 1: + best_individual, affinity = best if best else (None, None) + print(f"Generation: {gen + 1}") + print(f"Best SMILE sequence obtained: {best_individual}") + print(f"IC50 value: {affinity}") + + if best_individual: + molecule = Chem.MolFromSmiles(best_individual) + Draw.MolToImageFile(molecule, filename=f"examples/best_molecule_{name_img}.jpg", size=(400, 300)) + prepare_file(file_path=name_file_best, headers=["SMILE", "Affinity"]) + with open(f"{name_file_best}.csv", "a") as file: + file.write(f"{best_individual}, {affinity}\n") break + + parents = total[:bests] + + if gen == 0: + all_bests.append(float(parents[0][1])) + min_ic50 = min(all_bests) + best_generated = (parents[0][0], parents[0][1]) else: - parents=total[:bests] - if gen==0: + min_ic50 = min(all_bests) + if float(parents[0][1]) >= min_ic50: + smiles_saved = [x[0] for x in total] + smiles_saved = [smiles_saved[0], smiles_saved[random.randint(1, len(smiles_saved) - 1)]] + sum_not_improve += 1 + else: + if best_generated[1] > parents[0][1]: + best_generated = (parents[0][0], parents[0][1]) + sum_not_improve = 0 + if parents[0][1] not in all_bests: all_bests.append(float(parents[0][1])) - minumum_ic50=min(all_bests) - best_generated=(parents[0][0], parents[0][1]) - elif gen != 0: - minumum_ic50=min(all_bests) - if float(parents[0][1])>= minumum_ic50: - smiles_saved=[x[0] for x in total] - smiles_saved=[smiles_saved[0], smiles_saved[random.randint(1, len(smiles_saved)-1)]] - print("Saved smiles:", smiles_saved) - sum_not_improve+=1 - - else: - if best_generated[1] > parents[0][1]: - best_generated=(parents[0][0], parents[0][1]) - sum_not_improve=0 - if parents[0][1] not in all_bests: - all_bests.append(float(parents[0][1])) - - print("\n\n\n" + "Generation:", gen+1) - print("Best SMILE sequence obtained this generation:", parents[0][0]) - if parents[0][0] != best_generated[0]: - print("Best SMILE sequence obtained overall:", best_generated[0]) - print("IC50 value:", parents[0][1]) - if float(parents[0][1])>= minumum_ic50 and gen != 0: - print("Value not improved") - print("\n\n\n") - time.sleep(4) - continue - + print(f"Generation: {gen + 1}") + print(f"Best SMILE sequence obtained this generation: {parents[0][0]}") + if parents[0][0] != best_generated[0]: + print(f"Best SMILE sequence obtained overall: {best_generated[0]}") + print(f"IC50 value: {parents[0][1]}") + if float(parents[0][1]) >= min_ic50: + print("Value not improved") + time.sleep(4) + def compare_ic50(list_score, objective_ic50): ''' - Function to compare the affinity of the molecules with the objective affinity. + Compare the affinity of molecules with the objective IC50 value. Parameters: - -list_score: List of the affinity of the molecules. - -objective_ic50: Value of the affinity of the target protein that we want to obtain. - ''' - for i in list_score: - if i[1] <= objective_ic50: - return i[0], i[1] - else: - return False - - + list_score (list of tuples): List of tuples containing SMILES and affinity values. + objective_ic50 (float): Desired IC50 value. + Returns: + tuple of str or bool: Tuple of the best SMILES sequence and its affinity if the objective is met, otherwise False. + ''' + for item in list_score: + if item[1] <= objective_ic50: + return item[0], item[1] + return False diff --git a/src/get_coordinates.py b/src/get_coordinates.py index 83956a4..8159e78 100644 --- a/src/get_coordinates.py +++ b/src/get_coordinates.py @@ -2,20 +2,27 @@ def obtain_sdf(smile="", name="molecule_sdf"): """ - Function to obtain sdf file of an smile sequence in order to use it on PyMol - + Obtain the SDF file of a molecule given its SMILES sequence. + Parameters: - -smile: Sequence of the molecule in smile format. - -name: Name of the sdf file. + smile (str): SMILES sequence of the molecule. + name (str): Name of the output SDF file (without extension). + + Returns: + str: Message indicating the result of the file creation. """ - request=requests.get(f"https://cactus.nci.nih.gov/chemical/structure/{smile}/file?format=sdf&get3d=True") - sdf_file=request.text - print(sdf_file) + + url = f"https://cactus.nci.nih.gov/chemical/structure/{smile}/file?format=sdf&get3d=True" + + response = requests.get(url) + + sdf_file = response.text + if "Page not found" in sdf_file: print("SDF file not found") return "SDF file not found" else: - with open(f"{name}.sdf","w") as f: + with open(f"{name}.sdf", "w") as f: f.write(sdf_file) print("SDF file created") - + return "SDF file created" diff --git a/src/pretrained_rnn.py b/src/pretrained_rnn.py index 1ec4af7..e460a5f 100644 --- a/src/pretrained_rnn.py +++ b/src/pretrained_rnn.py @@ -26,39 +26,58 @@ def generator(path_model=r"models\definitive_models\rnn_model.hdf5", path_data=r"samples/txt_files/98k.txt", number_generated=100, img_druglike=True, path_destination_molecules=r""): ''' - Parameters: - -path_model: Path where the already trained model is located - -path_data: Path where the data used is located (path model and path data must have the same dimensions/same amount of different elements) - -number_generated: Number of molecules that are generated, by default 100 - -img_druglike: To generate .jpg images of the generated drug-like molecules (they will be ordered according to epoch) (By default True) - -Path_destination_molecules: Destination path of the generated SMILE sequences + Generate molecules using a pre-trained RNN model and evaluate if they are drug-like. + + Parameters: + path_model (str): Path to the pre-trained model file. + path_data (str): Path to the text file used for training the model. + number_generated (int): Number of molecules to generate (default is 100). + img_druglike (bool): Whether to generate images of drug-like molecules (default is True). + path_destination_molecules (str): Path to save the generated SMILES sequences. + + Returns: + list: List of generated SMILES sequences. + ''' - def split_input_target(valors): - input_text = valors[:-1] - target_idx = valors[-1] + def split_input_target(values): + + """ + Split the input and target for training. + + Parameters: + values (np.ndarray): Array of input and target values. + + Returns: + tuple: Input and target tensors. + """ + + input_text = values[:-1] + target_idx = values[-1] target = tf.one_hot(target_idx, depth=map_char) #depth must be equal to the number of different outputs of the pre-trained model target = tf.reshape(target, [-1]) return input_text, target def create_seed(max_molecules=137): ''' - Function that takes your ds and allows you to obtain a seed from it to generate molecules + Create a seed pattern for molecule generation. Parameters: - -max_molecules: Maximum length you want your pattern/seed to have, by default, 137 + max_molecules (int): Maximum length of the seed pattern (default is 137). + Returns: + np.ndarray: Seed pattern array. ''' - generador_seeds=tfds.as_numpy(dataset.take(random.randint(0, len(dades))).take(1)) + generador_seeds=tfds.as_numpy(dataset.take(random.randint(0, len(data))).take(1)) for a, b in enumerate(generador_seeds): break pattern=b[0][np.random.randint(0,max_molecules)] return pattern with open(f"{path_data}") as f: - dades = "\n".join(line.strip() for line in f) + data = "\n".join(line.strip() for line in f) - elements_smiles = {u: i for i, u in enumerate(sorted(set(dades)))} + elements_smiles = {u: i for i, u in enumerate(sorted(set(data)))} elements_smiles.update({-1: "\n"}) @@ -70,7 +89,7 @@ def create_seed(max_molecules=137): - slices = np.array([[elements_smiles[c]] for c in dades]) + slices = np.array([[elements_smiles[c]] for c in data]) # Create training examples / targets char_dataset = tf.data.Dataset.from_tensor_slices(slices) @@ -155,5 +174,8 @@ def create_model(): return total_smiles #Example of use -generator(path_model=r"models\specific_RNN_models\modelo_rnn_insectos.hdf5", path_data=r"samples\txt_files\insectos.txt", - number_generated=100, img_druglike=True, path_destination_molecules=r"examples/generated_molecules/generated_molecules.txt") +generator(path_model=r"models\specific_RNN_models\modelo_rnn_insectos.hdf5", + path_data=r"samples\txt_files\insectos.txt", + number_generated=100, + img_druglike=True, + path_destination_molecules=r"examples/generated_molecules/generated_molecules.txt") diff --git a/src/pymol_3d.py b/src/pymol_3d.py index 3f46983..f7fe88e 100644 --- a/src/pymol_3d.py +++ b/src/pymol_3d.py @@ -1,49 +1,60 @@ - -''' +""" This script is used to convert a 3D molecule from SDF format to STL format. -In order to run this script, you need to install PyMOl first, and can't be run directly through terminal, but need to be run through PyMol. +In order to run this script, you need to install PyMOL first. It can't be run directly through the terminal, but needs to be run through PyMOL. You can change the output format by changing the extension of the output file in cmd.save(). -''' - +""" import __main__ import os -__main__.pymol_argv = [ 'pymol', '-qc'] # Quiet and no GUI import pymol import requests -pymol.finish_launching() from pymol import cmd - +# Configure PyMOL to run in quiet mode with no GUI +__main__.pymol_argv = ['pymol', '-qc'] +pymol.finish_launching() def obtain_sdf(smile="", name="molecule_sdf"): """ - Function to obtain sdf file of an smile sequence in order to use it on PyMol - + Function to obtain an SDF file of a molecule given its SMILES sequence. + Parameters: - -smile: Sequence of the molecule in smile format. - -name: Name of the sdf file. + smile (str): SMILES sequence of the molecule. + name (str): Name of the SDF file to be saved (without extension). + """ - request=requests.get(f"https://cactus.nci.nih.gov/chemical/structure/{smile}/file?format=sdf&get3d=True") - sdf_file=request.text - print(sdf_file) + response = requests.get(f"https://cactus.nci.nih.gov/chemical/structure/{smile}/file?format=sdf&get3d=True") + sdf_file = response.text + if "Page not found" in sdf_file: print("SDF file not found") return "SDF file not found" else: - with open(f"{name}.sdf","w") as f: + with open(f"{name}.sdf", "w") as f: f.write(sdf_file) print("SDF file created") - + return "SDF file created" def get_pymol(smile="", name="molecule_sdf"): - if not name in os.listdir(): + + """ + Convert an SDF file to STL format using PyMOL. + + Parameters: + smile (str): SMILES sequence of the molecule. + name (str): Name of the SDF file to be used (without extension). + """ + + if not os.path.isfile(f"{name}.sdf"): obtain_sdf(smile=smile, name=name) + cmd.load(f"{name}.sdf") + cmd.save('molecule_stl.stl') + cmd.quit() - -get_pymol(smile='CC(=O)OC1=CC=CC=C1C(=O)O', name='molecule_sdf') # Change the smile and output name here \ No newline at end of file +# Example usage +get_pymol(smile='CC(=O)OC1=CC=CC=C1C(=O)O', name='molecule_sdf')