Format inconsistency between graphs constructed by pdb_code and pdb_path? #142
Hi @johnnytam100, I don't believe this is a bug. Checking for equality with `==` doesn't work here, because by default Python compares object identity rather than contents:

```python
class Test:
    val = 1

a = Test()
b = Test()
a == b
# Output: False
```

However, we can define an `__eq__` method so instances compare by value:

```python
class Test:
    def __init__(self):
        self.val = 1

    def __eq__(self, other):
        if self.val == other.val:
            return True
        else:
            return False

a = Test()
b = Test()
a == b
# Output: True
```

We can see this is the case here when we make two of the 'same' graph:

```python
g_pdbcode_1 = construct_graph(config=config, pdb_code="1ema")
g_pdbcode_2 = construct_graph(config=config, pdb_code="1ema")
g_pdbcode_1 == g_pdbcode_2
# Output: False
```

Instead, we should check whether the graphs are the same, rather than whether the Python objects are the same. You can do this with:

```python
import networkx as nx

nx.is_isomorphic(g_pdbcode, g_pdbpath)
# Output: True (I have tried this on the example you provided)
```

A fuller and more robust test (which also checks node and edge attributes) would be:

```python
import numpy as np

def equal_dictionaries(dic1, dic2):
    # Attribute dictionaries match if they share the same keys and
    # (possibly array-valued) values; np.array_equal handles numpy arrays.
    if dic1.keys() != dic2.keys():
        return False
    for key, value in dic1.items():
        if not np.array_equal(value, dic2[key]):
            return False
    return True

nx.is_isomorphic(
    g_pdbcode, g_pdbpath,
    node_match=equal_dictionaries,
    edge_match=equal_dictionaries,
)
```

I can add this to Graphein as I think it would be a useful feature.
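A related option: if the attributes you want to compare are simple categorical values, NetworkX also provides ready-made matcher factories. A minimal sketch, with attribute names (`residue_name`, `kind`) chosen only for illustration and not guaranteed to match what your Graphein config actually populates:

```python
import networkx as nx
from networkx.algorithms import isomorphism as iso

# "residue_name" and "kind" are illustrative attribute keys; substitute
# whichever node/edge attributes your graph construction actually sets.
node_match = iso.categorical_node_match("residue_name", default=None)
edge_match = iso.categorical_edge_match("kind", default=None)

nx.is_isomorphic(g_pdbcode, g_pdbpath, node_match=node_match, edge_match=edge_match)
```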
Arian, I am sorry for the silly testing using `==`. I see your tests, so you have confirmed the graphs from `pdb_code` and `pdb_path` are the same.

Why I raised this question was because training behaved differently when I used graphs constructed from `pdb_code` versus graphs constructed from `pdb_path`. Here is the code I used:

```python
import pickle
import networkx as nx
import os
import glob
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from graphein.ml.conversion import GraphFormatConvertor
from tqdm.notebook import tqdm
import numpy as np
from torch_geometric.data import DataLoader
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, global_add_pool
from torch.nn.functional import mse_loss, nll_loss, relu, softmax, cross_entropy
from torch.nn import functional as F
from torchmetrics.functional import accuracy
from pytorch_lightning.callbacks import ModelCheckpoint
import os
import random
import pytorch_lightning as pl
import torch.nn as nn
import pandas as pd

# Load
fp_df = pd.read_csv("./target_mapping.csv")

# Label col
label_col = "states.0.em_max"

# MAE loss
mae_loss = nn.L1Loss()

# collect graphs
path_list = []
graph_list = []
for path in glob.iglob('./selected_grep-ATOM/*.p'):
    path_list.append(path)
path_list.sort()

for path in path_list:
    with open(path, 'rb') as f:  # notice the r instead of w
        graph = pickle.load(f)
        graph_list.append(graph)

# nx2pyg
format_convertor = GraphFormatConvertor('nx', 'pyg',
                                        verbose='gnn',
                                        columns=None)
pyg_list = [format_convertor(graph) for graph in tqdm(graph_list)]

# assign target
for (idx, g), p in zip(enumerate(pyg_list), path_list):
    if not fp_df.loc[fp_df["uuid"]==p[21:26]][label_col].isnull().values[0]:
        # g.y = y_list[idx] # original
        g.y = int(fp_df.loc[fp_df["uuid"]==p[21:26]][label_col].values[0])  # regression
        g.coords = torch.FloatTensor(g.coords[0])

# other formatting (?)
for i in pyg_list:
    if i.coords[0].shape[0] == len(i.node_id):
        pass
    else:
        print(i)
        pyg_list.remove(i)

# train, val, test split
np.random.seed(42)
idx_all = np.arange(len(pyg_list))
np.random.shuffle(idx_all)
train_idx, valid_idx, test_idx = np.split(idx_all, [int(.8*len(pyg_list)), int(.9*len(pyg_list))])
train, valid, test = [pyg_list[i] for i in train_idx], [pyg_list[i] for i in valid_idx], [pyg_list[i] for i in test_idx]

# compile model
config_default = dict(
    n_hid=8,
    n_out=8,
    batch_size=4,
    dropout=0.5,
    lr=0.005,
    num_heads=32,
    num_att_dim=64,
    model_name='GAT'
)

class Struct:
    def __init__(self, **entries):
        self.__dict__.update(entries)

config = Struct(**config_default)

global model_name
model_name = config.model_name

class GraphNets(pl.LightningModule):
    def __init__(self):
        super().__init__()
        if model_name == 'GCN':
            self.layer1 = GCNConv(in_channels=3, out_channels=config.n_hid)
            self.layer2 = GCNConv(in_channels=config.n_hid, out_channels=config.n_out)
        elif model_name == 'GAT':
            self.layer1 = GATConv(3, config.num_att_dim, heads=config.num_heads, dropout=config.dropout)
            self.layer2 = GATConv(config.num_att_dim * config.num_heads, out_channels=config.n_out, heads=1, concat=False,
                                  dropout=config.dropout)
        elif model_name == 'GraphSAGE':
            self.layer1 = SAGEConv(3, config.n_hid)
            self.layer2 = SAGEConv(config.n_hid, config.n_out)
        self.decoder = nn.Linear(config.n_out, 1)

    def forward(self, g):
        x = g.coords
        x = F.dropout(x, p=config.dropout, training=self.training)
        x = F.elu(self.layer1(x, g.edge_index))
        x = F.dropout(x, p=config.dropout, training=self.training)
        x = self.layer2(x, g.edge_index)
        x = global_add_pool(x, batch=g.batch)
        x = self.decoder(x)
        # return softmax(x) # original
        return x

    def training_step(self, batch, batch_idx):
        x = batch
        y = x.y
        y_hat = self(x)
        # loss = cross_entropy(y_hat, y) # original
        loss = mae_loss(y_hat, y.float())
        # acc = accuracy(y_hat, y) # original
        self.log("train_loss", loss)
        # self.log("train_acc", acc) # original
        return loss

    def validation_step(self, batch, batch_idx):
        x = batch
        y = x.y
        y_hat = self(x)
        # loss = cross_entropy(y_hat, y) # original
        loss = mae_loss(y_hat, y.float())
        # acc = accuracy(y_hat, y) # original
        self.log("valid_loss", loss)
        # self.log("valid_acc", acc) # original

    def test_step(self, batch, batch_idx):
        x = batch
        y = x.y
        y_hat = self(x)
        # loss = cross_entropy(y_hat, y) # original
        loss = mae_loss(y_hat, y.float())
        # acc = accuracy(y_hat, y) # original
        # y_pred_softmax = torch.log_softmax(y_hat, dim = 1) # original
        # y_pred_tags = torch.argmax(y_pred_softmax, dim = 1) # original
        # f1 = f1_score(y.detach().cpu().numpy(), y_pred_tags.detach().cpu().numpy(), average = 'weighted') # original
        self.log("test_loss", loss)
        # self.log("test_acc", acc) # original
        # self.log("test_f1", f1) # original

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=config.lr)
        return optimizer

GraphNets()

file_path = './graphein_model'
if not os.path.exists(file_path):
    os.mkdir(file_path)

checkpoint_callback = ModelCheckpoint(
    monitor="valid_loss",
    dirpath=file_path,
    filename="model-{epoch:02d}-{val_loss:.2f}",
    save_top_k=1,
    mode="min",
)

# data loader
train_loader = DataLoader(train, batch_size=config.batch_size, shuffle=True, drop_last=True)
valid_loader = DataLoader(valid, batch_size=32)
test_loader = DataLoader(test, batch_size=32)

# train model
model = GraphNets()
trainer = pl.Trainer(max_epochs=400, gpus=-1, callbacks=[checkpoint_callback])
trainer.fit(model, train_loader, valid_loader)

# evaluate on the model with the best validation set
best_model = GraphNets.load_from_checkpoint(checkpoint_callback.best_model_path)
out_best_test = trainer.test(best_model, test_loader)[0]
```
Do you know what could have caused the difference between the input graphs constructed from `pdb_code` and from `pdb_path`?
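One way to narrow down where such a difference might come from is to diff the attribute keys of the two NetworkX graphs before conversion. A rough sketch (assuming `g_pdbcode` and `g_pdbpath` are the NetworkX graphs returned by `construct_graph`; this is only a diagnostic idea, not something from the thread):

```python
# Compare which node-level and graph-level attributes each graph carries.
keys_code = {k for _, d in g_pdbcode.nodes(data=True) for k in d}
keys_path = {k for _, d in g_pdbpath.nodes(data=True) for k in d}

print("node attrs only in pdb_code graph:", keys_code - keys_path)
print("node attrs only in pdb_path graph:", keys_path - keys_code)
print("graph attrs only in pdb_code graph:", set(g_pdbcode.graph) - set(g_pdbpath.graph))
print("graph attrs only in pdb_path graph:", set(g_pdbpath.graph) - set(g_pdbcode.graph))
```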
In particular, I want to understand this part: what does it do?

```python
# other formatting (?)
for i in pyg_list:
    if i.coords[0].shape[0] == len(i.node_id):
        pass
    else:
        print(i)
        pyg_list.remove(i)
```
Hey @johnnytam100, no problem at all. I'll take a closer look at your code later today. With respect to the code block you quoted:

```python
# other formatting (?)
for i in pyg_list:
    if i.coords[0].shape[0] == len(i.node_id):
        pass
    else:
        print(i)
        pyg_list.remove(i)
```

This loops over the list of converted graphs and simply checks whether the shape of the coordinate array matches the number of nodes in the graph, i.e. do we have a coordinate for each node and a node for each coordinate. If these don't match, we remove the graph from the list. This can throw off indexing with labels, so be careful when using it.
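One possible pattern that avoids the indexing problem is to filter the graphs and their source paths together, so that label lookups by path stay aligned with the surviving graphs. A sketch, assuming `pyg_list` and `path_list` are still index-aligned at this point:

```python
# Same coordinate/node-count check as the quoted block, but the graph and its
# source path are kept (or dropped) as a pair, preserving alignment.
kept = [(g, p) for g, p in zip(pyg_list, path_list)
        if g.coords[0].shape[0] == len(g.node_id)]
pyg_list = [g for g, _ in kept]
path_list = [p for _, p in kept]
```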
So, I'm not sure why this would be different between the graphs created from PDB files and from PDB codes, but I think the problem is here:

```python
for (idx, g), p in zip(enumerate(pyg_list), path_list):
    if not fp_df.loc[fp_df["uuid"]==p[21:26]][label_col].isnull().values[0]:
        # g.y = y_list[idx] # original
        g.y = int(fp_df.loc[fp_df["uuid"]==p[21:26]][label_col].values[0])  # regression
        g.coords = torch.FloatTensor(g.coords[0])
```

From what I understand, you're checking to see if the dataset has a label for a particular example. If it does, you assign the label to `g.y` and convert the coordinates; if it doesn't, the graph is left in the list unlabelled. I think the correct way to do this is a very simple fix:

```python
for (idx, g), p in zip(enumerate(pyg_list), path_list):
    if not fp_df.loc[fp_df["uuid"]==p[21:26]][label_col].isnull().values[0]:
        # g.y = y_list[idx] # original
        g.y = int(fp_df.loc[fp_df["uuid"]==p[21:26]][label_col].values[0])  # regression
        g.coords = torch.FloatTensor(g.coords[0])
    else:
        pyg_list.remove(g)
```
@a-r-j Thank you so much for helping out!!! Yes, you exactly described what I was trying to do. I wrote something very similar before, but then ran into another error.

However, I now have a fix that works, although I don't know why: writing two consecutive loops to remove the graphs without `y`:

```python
# Load
fp_df = pd.read_csv("./20220307_fpbase_all.csv")

# Label col
label_col = "states.0.em_max"

# MAE loss
mae_loss = nn.L1Loss()

# graphs from pdb_path
path_list = []
graph_list = []
for path in glob.iglob('./selected_grep-ATOM/*.p'):
    path_list.append(path)
path_list.sort()

for path in path_list:
    with open(path, 'rb') as f:  # notice the r instead of w
        graph = pickle.load(f)
        graph_list.append(graph)

# nx2pyg
format_convertor = GraphFormatConvertor('nx', 'pyg',
                                        verbose='gnn',
                                        columns=None)
pyg_list = [format_convertor(graph) for graph in tqdm(graph_list)]

# assign target
for (idx, g), p in zip(enumerate(pyg_list), path_list):
    if not fp_df.loc[fp_df["uuid"]==p[21:26]][label_col].isnull().values[0]:
        # g.y = y_list[idx] # original
        g.y = int(fp_df.loc[fp_df["uuid"]==p[21:26]][label_col].values[0])
        g.coords = torch.FloatTensor(g.coords[0])

# other formatting (?)
for i in pyg_list:
    if i.coords.shape[0] == len(i.node_id):
        pass
    else:
        print(i)
        pyg_list.remove(i)

for i in pyg_list:
    if i.y == None:
        print(i)
        pyg_list.remove(i)
# still have one graph that doesn't have y, I don't know why

for i in pyg_list:
    if i.y == None:
        print(i)
        pyg_list.remove(i)
# now all graphs have y
```
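A likely explanation for the leftover graph: calling `list.remove` inside a `for` loop over the same list skips the element that follows each removal, so two adjacent unlabelled graphs need two passes to clear. A small demonstration of that behaviour, plus a filter that avoids it:

```python
nums = [1, None, None, 2]
for n in nums:
    if n is None:
        nums.remove(n)
print(nums)  # [1, None, 2]  -- the second None was skipped

# Building a new list instead of mutating during iteration avoids the problem:
nums = [1, None, None, 2]
nums = [n for n in nums if n is not None]
print(nums)  # [1, 2]
```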
Hi Arian! Seems there is a format inconsistency between graphs constructed by `pdb_code` and `pdb_path`.