# top_layer.py
import pandas as pd
import numpy as np
from sklearn import linear_model, preprocessing
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from scipy.spatial.distance import cdist
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import ConvergenceWarning
import xgboost
from sklearn.decomposition import PCA
import warnings
import random
import time
import os
import sys
import argparse
import torch
from sklearn.cluster import KMeans
from Bio import SeqIO
# Ignore FutureWarnings and ConvergenceWarnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning, module="sklearn.neural_network")
pd.options.mode.chained_assignment = None # default='warn'
# Function to read in the data
def read_data(dataset_name, base_path, file_type, embeddings_type='both', experimental=False):
    """
    Read in data from files based on the specified parameters.
    Args:
        dataset_name (str): Name of the dataset
        base_path (str): Base path for the data files
        file_type (str): Type of files to read ('csvs' or 'pts')
        embeddings_type (str): Type of embeddings to use ('average', 'mutated', or 'both')
        experimental (bool): Whether this is experimental data
    Returns:
        tuple: embeddings, labels, and hierarchy data if not experimental;
            otherwise the embeddings dataframe alone
    """
    # Construct the file paths
if file_type == "csvs":
labels_file = os.path.join(base_path, 'labels', dataset_name.split('_')[0] + '_labels.csv')
print(labels_file)
hie_file = os.path.join(base_path, 'hie_temp', dataset_name.split('_')[0] + '.csv')
embeddings_file = os.path.join(base_path, 'csvs', dataset_name + '.csv')
print(embeddings_file)
# Read in mean embeddings across all rounds
embeddings = pd.read_csv(embeddings_file, index_col=0)
elif file_type == "pts":
labels_file = os.path.join(base_path, 'labels', dataset_name.split('_')[-1] + '_labels.csv')
print(labels_file)
hie_file = os.path.join(base_path, 'hie_temp', dataset_name.split('_')[-1] + '.csv')
embeddings_file = os.path.join(base_path, 'pts', dataset_name + '.pt')
print(embeddings_file)
# Read in pytorch tensor of embeddings
embeddings = torch.load(embeddings_file)
        # Extract the requested embedding type for each variant
if embeddings_type == 'average':
embeddings = {key: value['average'].numpy() for key, value in embeddings.items()}
elif embeddings_type == 'mutated':
embeddings = {key: value['mutated'].numpy() for key, value in embeddings.items()}
elif embeddings_type == 'both':
embeddings = {key: torch.cat((value['average'], value['mutated'])).numpy() for key, value in embeddings.items()}
else:
print("Invalid embeddings_type. Please choose 'average', 'mutated', or 'both'")
return None, None
# Convert embeddings dictionary to a dataframe
embeddings = pd.DataFrame.from_dict(embeddings, orient='index')
else:
print("Invalid file type. Please choose either 'csvs' or 'pts'")
return None, None
# if not experimental
if not experimental:
# Read in labels
labels = pd.read_csv(labels_file)
# Read in hierarchy
hie_data = pd.read_csv(hie_file)
# Filter out rows where fitness is NaN
labels = labels[labels['fitness'].notna()]
# Filter out rows in embeddings where row names are not in labels variant column
embeddings = embeddings[embeddings.index.isin(labels['variant'])]
# Align labels by variant
labels = labels.sort_values(by=['variant'])
# Align embeddings by row name
embeddings = embeddings.sort_index()
        # Reset the label index after sorting so positions match the sorted embeddings
        labels = labels.reset_index(drop=True)
# Get the variants in labels and embeddings, convert to list
label_variants = labels['variant'].tolist()
embedding_variants = embeddings.index.tolist()
        # Check if embedding row names and label variants are identical
        if label_variants == embedding_variants:
            print('Embeddings and labels are aligned')
        else:
            print('Warning: embeddings and labels are not aligned')
        # Return embeddings, labels, and hierarchy data
        return embeddings, labels, hie_data
else:
return embeddings
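# Minimal usage sketch for read_data() (illustration only; the dataset name below is a
# hypothetical placeholder). With experimental=False the function returns aligned
# embeddings, labels, and hierarchy tables.
def _example_read_data():
    emb, lab, hie = read_data(
        dataset_name='myprotein_esm2_t33_650M_UR50D',  # hypothetical dataset name
        base_path='./results_means/',
        file_type='csvs',
        embeddings_type='average',
        experimental=False,
    )
    # Rows of emb are indexed by variant and aligned with the 'variant' column of lab
    print(emb.shape, lab.shape, hie.shape)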
# Active learning function for one iteration
def top_layer(iter_train, iter_test, embeddings_pd, labels_pd, measured_var, regression_type='ridge', top_n=None, final_round=10):
"""
Perform one iteration of active learning.
Args:
iter_train (list): Iterations to use for training
iter_test (int): Iteration to use for testing
embeddings_pd (pd.DataFrame): Embeddings data
labels_pd (pd.DataFrame): Labels data
measured_var (str): Variable to predict
regression_type (str): Type of regression model to use
        top_n (int): Number of top predictions to return (currently unused)
        final_round (int): Final round number (currently unused)
Returns:
tuple: Test predictions and all predictions
"""
# Get the variants in labels and embeddings, convert to list
label_variants = labels_pd['variant'].tolist()
embedding_variants = embeddings_pd.index.tolist()
    # Check if embedding row names and label variants are identical
    if label_variants == embedding_variants:
        print('Embeddings and labels are aligned')
    else:
        print('Warning: embeddings and labels are not aligned')
# reset the indices of embeddings_pd and labels_pd
embeddings_pd = embeddings_pd.reset_index(drop=True)
labels_pd = labels_pd.reset_index(drop=True)
# save column 'iteration' in the labels dataframe
iteration = labels_pd['iteration']
# save labels
labels = labels_pd
    # keep the embeddings dataframe under a short alias
    a = embeddings_pd
# subset a, y to only include the rows where iteration = iter_train and iter_test
idx_train = iteration[iteration.isin(iter_train)].index.to_numpy()
idx_test = iteration[iteration.isin([iter_test])].index.to_numpy()
# subset a to only include the rows where iteration = iter_train and iter_test
X_train = a.loc[idx_train, :]
X_test = a.loc[idx_test, :]
y_train = labels[iteration.isin(iter_train)][measured_var]
y_test = labels[iteration.isin([iter_test])][measured_var]
# fit
if regression_type == 'ridge':
model = linear_model.RidgeCV()
    elif regression_type == 'lasso':
        model = linear_model.LassoCV(max_iter=100000, tol=1e-3)
    elif regression_type == 'elasticnet':
        model = linear_model.ElasticNetCV(max_iter=100000, tol=1e-3)
elif regression_type == 'linear':
model = linear_model.LinearRegression()
elif regression_type == 'neuralnet':
        model = MLPRegressor(hidden_layer_sizes=(5,), max_iter=1000, activation='relu', solver='adam', alpha=0.001,
batch_size='auto', learning_rate='constant', learning_rate_init=0.001, power_t=0.5,
momentum=0.9, nesterovs_momentum=True, shuffle=True, random_state=1, tol=0.0001,
verbose=False, warm_start=False, early_stopping=False, validation_fraction=0.1, beta_1=0.9,
beta_2=0.999, epsilon=1e-08)
elif regression_type == 'randomforest':
model = RandomForestRegressor(n_estimators=100, criterion='friedman_mse', max_depth=None, min_samples_split=2,
                                      min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=1.0,
max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False,
n_jobs=None, random_state=1, verbose=0, warm_start=False, ccp_alpha=0.0,
max_samples=None)
elif regression_type == 'gradientboosting':
model = xgboost.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
max_depth=5, alpha=10, n_estimators=10)
    else:
        raise ValueError("Unknown regression_type: {}".format(regression_type))
    model.fit(X_train, y_train)
# make predictions on train data
y_pred_train = model.predict(X_train)
y_std_train = np.zeros(len(y_pred_train))
# make predictions on test data
# NOTE: can work on alternate 2-n round strategies here
y_pred_test = model.predict(X_test)
y_std_test = np.zeros(len(y_pred_test))
    # combine predicted and actual values of the measured variable with variant IDs into new dataframes
df_train = pd.DataFrame({'variant': labels.variant[idx_train], 'y_pred': y_pred_train, 'y_actual': y_train})
df_test = pd.DataFrame({'variant': labels.variant[idx_test], 'y_pred': y_pred_test, 'y_actual': y_test})
# sort df_test by y_pred
df_test = df_test.sort_values(by=['y_pred'], ascending=False)
df_all = pd.concat([df_train, df_test])
# sort df_all by y_pred
df_all = df_all.sort_values(by=['y_pred'], ascending=False)
return df_test, df_all
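# Minimal sketch of the top_layer() contract on synthetic data (not part of the
# original pipeline): embeddings_pd is indexed by variant, labels_pd carries
# 'variant', the measured variable, and 'iteration', and both must share the same
# variant order. Rounds listed in iter_train are used for fitting; variants in the
# iter_test round are ranked by predicted value.
def _example_top_layer():
    rng = np.random.default_rng(0)
    variants = ['A{}G'.format(i) for i in range(1, 21)]
    emb = pd.DataFrame(rng.normal(size=(20, 8)), index=variants)
    lab = pd.DataFrame({
        'variant': variants,
        'fitness': list(rng.normal(size=10)) + [np.nan] * 10,
        'iteration': [0] * 10 + [1001] * 10,  # 0 = measured round, 1001 = unmeasured pool
    })
    df_test, df_all = top_layer(
        iter_train=[0], iter_test=1001,
        embeddings_pd=emb, labels_pd=lab,
        measured_var='fitness', regression_type='ridge',
    )
    print(df_test.head())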
def read_experimental_data(base_path, round_file_name, WT_sequence, single_mutant=True):
"""
Read experimental data from an Excel file and process variant information.
Args:
base_path (str): Base path for the data file
round_file_name (str): Name of the Excel file
WT_sequence (str): Wild-type sequence
single_mutant (bool): Whether to process single mutants
Returns:
pd.DataFrame: Processed experimental data
"""
    file_path = os.path.join(base_path, round_file_name)
df = pd.read_excel(file_path)
    # Iterate through the 'Variant' column and prepend the wild-type residue from WT_sequence
if single_mutant:
updated_variants = []
for _, row in df.iterrows():
variant = row['Variant']
if variant == 'WT':
updated_variants.append(variant)
else:
position = int(variant[:-1])
wt_aa = WT_sequence[position - 1]
updated_variant = wt_aa + variant
updated_variants.append(updated_variant)
df['updated_variant'] = updated_variants # Add the updated variants to the DataFrame
else:
df.rename(columns={'Variant': 'updated_variant'}, inplace=True)
return df
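# Illustration of the single-mutant renaming above (hypothetical values): the
# Excel sheet is assumed to encode variants as '<position><new_aa>' (e.g. '3Y'),
# and the wild-type residue at that position is prepended to give 'S3Y'.
def _example_variant_renaming():
    wt = 'MKSLVIT'   # hypothetical wild-type sequence
    variant = '3Y'   # position 3 mutated to tyrosine
    position = int(variant[:-1])
    print(wt[position - 1] + variant)  # prints 'S3Y'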
def create_dataframes(df_list, expected_index):
"""
Create and process dataframes from a list of experimental data.
Args:
df_list (list): List of dataframes with experimental data
expected_index (list): Expected index for the final dataframe
Returns:
tuple: Two processed dataframes
"""
# First dataframe
dfs = [] # List to store modified dataframes
    for i, df in enumerate(df_list, start=1):
        # Create a copy of the dataframe
        df_copy = df.copy()
        # For the first round, keep WT and give it iteration 0; for later rounds, drop WT rows
if i == 1:
df_copy.loc[df_copy['updated_variant'] == 'WT', 'iteration'] = 0
else:
df_copy = df_copy[df_copy['updated_variant'] != 'WT']
df_copy.loc[df_copy['updated_variant'] != 'WT', 'iteration'] = i
df_copy['iteration'] = df_copy['iteration'].astype(int)
df_copy.rename(columns={'updated_variant': 'variant'}, inplace=True) # Rename the column
dfs.append(df_copy)
df1 = pd.concat(dfs, ignore_index=True)
df2 = pd.concat(dfs, ignore_index=True)
# Check for duplicates in the 'variant' column of df1 or df2
df1_duplicates = df1[df1.duplicated(subset=['variant'], keep=False)]
df2_duplicates = df2[df2.duplicated(subset=['variant'], keep=False)]
if not df1_duplicates.empty or not df2_duplicates.empty:
print("Duplicates found in variant column:")
if not df1_duplicates.empty:
print("Duplicates in df1:")
print(df1_duplicates)
if not df2_duplicates.empty:
print("Duplicates in df2:")
print(df2_duplicates)
print("Exiting.")
return None, None
df1 = df1[['variant', 'iteration']]
df2 = df2[['variant', 'fitness', 'iteration']]
expected_index_blank = [variant for variant in expected_index if variant not in df2['variant'].tolist()]
# make a df_external that has a column 'variant' with all the variants in expected_index
df_external = pd.DataFrame({'variant': expected_index_blank})
df_external['fitness'] = np.nan
df_external['iteration'] = 1001
    df2 = pd.concat([df2, df_external], ignore_index=True)
    # order df2 by expected_index (unmeasured variants keep NaN fitness)
    df2 = df2.set_index('variant').reindex(expected_index).reset_index()
return df1, df2
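# Toy sketch of the create_dataframes() outputs (hypothetical values, not from the
# original experiments): df1 lists measured variants with their round number, df2 is
# the full label table aligned to the embeddings index, with unmeasured variants
# assigned iteration 1001 and NaN fitness.
def _example_create_dataframes():
    round1 = pd.DataFrame({
        'updated_variant': ['WT', 'A5G', 'S7Y'],  # hypothetical round-1 measurements
        'fitness': [1.0, 1.3, 0.8],
    })
    expected = ['WT', 'A5G', 'S7Y', 'K9R']        # K9R is in the embeddings but unmeasured
    df1, df2 = create_dataframes([round1], expected)
    print(df1)
    print(df2)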
# import data
dataset_name = 'hc_esm2_t48_15B_UR50D'
base_path = './results_means/'
file_type = 'csvs'
embeddings_type = 'average'
experimental = True
embeddings = read_data(dataset_name, base_path, file_type, embeddings_type, experimental)
# replace WT Wild-type sequence index in embeddings with 'WT'
embeddings = embeddings.rename(index={'WT Wild-type sequence': 'WT'})
print(embeddings)
experiment_path = './experimental_data/'
round_file_name = 'Antibody_Round1.xlsx'
protein_seq = 'MQVQLVESGGGVVQPGRSLRLSCAASGFTFSNYAMYWVRQAPGKGLEWVAVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRTEDTAVYYCASGSDYGDYLLVYWGQGTLVTVSSSTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKRVEPKSCDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSREEMTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPGK'
experimental_data = read_experimental_data(experiment_path, round_file_name, protein_seq)
print(experimental_data)
df_list = [experimental_data]
iterations_one, labels_one = create_dataframes(df_list, embeddings.index)
print(iterations_one)
print(labels_one)
iteration_old = iterations_one
embeddings_pd = embeddings
labels_pd = labels_one
measured_var = 'fitness'
regression_type = 'randomforest'
num_mutants_per_round = 16
final_round = 16
df_test, df_all = top_layer(
iter_train=iteration_old['iteration'].unique().tolist(),
iter_test=1001,
embeddings_pd=embeddings_pd,
labels_pd=labels_pd,
measured_var=measured_var,
regression_type=regression_type,
top_n=None,
final_round=final_round
)
#df_all.to_csv('prediction/round1_all_new.csv', index=False)
df_all = df_all[df_all['y_actual'].isna()][['variant', 'y_pred']]
df_all = df_all.sort_values(by=['y_pred'], ascending=False)
df_all.to_csv('prediction/round1_prediction.csv', index=False)
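# The full ranking is written above. If only the next batch of variants to test is
# needed, one option (an assumption, not part of the original output; the file name
# below is hypothetical) is to keep the top num_mutants_per_round rows of the ranking:
# df_next_round = df_all.head(num_mutants_per_round)
# df_next_round.to_csv('prediction/round1_top_prediction.csv', index=False)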