From 3ae2b2a20b01fda1236e6950d089f2bc6eac91aa Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 18 Dec 2024 12:28:57 -0500 Subject: [PATCH] Remove unused rgat files (#1961) Co-authored-by: Miro --- graph/R-GAT/igbh/tiny/models/dataloader.py | 82 ------ graph/R-GAT/igbh/tiny/models/gnn.py | 296 --------------------- graph/R-GAT/igbh/tiny/models/main.py | 79 ------ graph/R-GAT/igbh/tiny/models/utils.py | 224 ---------------- 4 files changed, 681 deletions(-) delete mode 100644 graph/R-GAT/igbh/tiny/models/dataloader.py delete mode 100644 graph/R-GAT/igbh/tiny/models/gnn.py delete mode 100644 graph/R-GAT/igbh/tiny/models/main.py delete mode 100644 graph/R-GAT/igbh/tiny/models/utils.py diff --git a/graph/R-GAT/igbh/tiny/models/dataloader.py b/graph/R-GAT/igbh/tiny/models/dataloader.py deleted file mode 100644 index cc64d1466..000000000 --- a/graph/R-GAT/igbh/tiny/models/dataloader.py +++ /dev/null @@ -1,82 +0,0 @@ -import torch -from torch_geometric.data import InMemoryDataset, Data -from dgl.data import DGLDataset - -from utils import IGL260MDataset - -# TODO: Make a PyG dataloader for large datasets - - -class IGL260M_PyG(InMemoryDataset): - def __init__(self, args): - super().__init__(root, transform, pre_transform, pre_filter) - - def process(self): - dataset = IGL260MDataset(root=self.dir, size=args.dataset_size, - in_memory=args.in_memory, classes=args.type_classes, synthetic=args.synthetic) - node_features = torch.from_numpy(dataset.paper_feat) - node_edges = torch.from_numpy(dataset.paper_edge).T - node_labels = torch.from_numpy(dataset.paper_label).to(torch.long) - data = Data(x=node_features, edge_index=node_edges, y=node_labels) - - n_nodes = node_features.shape[0] - - n_train = int(n_nodes * 0.6) - n_val = int(n_nodes * 0.2) - - train_mask = torch.zeros(n_nodes, dtype=torch.bool) - val_mask = torch.zeros(n_nodes, dtype=torch.bool) - test_mask = torch.zeros(n_nodes, dtype=torch.bool) - - train_mask[:n_train] = True - val_mask[n_train:n_train + n_val] = True - test_mask[n_train + n_val:] = True - - data.train_mask = train_mask - data.val_mask = val_mask - data.test_mask = test_mask - - -class IGL260M_DGL(DGLDataset): - def __init__(self, args): - self.dir = args.path - super().__init__(name='IGB260M') - - def process(self): - dataset = IGL260MDataset(root=self.dir, size=args.dataset_size, - in_memory=args.in_memory, classes=args.type_classes, synthetic=args.synthetic) - node_features = torch.from_numpy(dataset.paper_feat) - node_edges = torch.from_numpy(dataset.paper_edge) - node_labels = torch.from_numpy(dataset.paper_label).to(torch.long) - - self.graph = dgl.graph( - (node_edges[:, 0], node_edges[:, 1]), num_nodes=node_features.shape[0]) - - self.graph.ndata['feat'] = node_features - self.graph.ndata['label'] = node_labels - - self.graph = dgl.remove_self_loop(self.graph) - self.graph = dgl.add_self_loop(self.graph) - - n_nodes = node_features.shape[0] - - n_train = int(n_nodes * 0.6) - n_val = int(n_nodes * 0.2) - - train_mask = torch.zeros(n_nodes, dtype=torch.bool) - val_mask = torch.zeros(n_nodes, dtype=torch.bool) - test_mask = torch.zeros(n_nodes, dtype=torch.bool) - - train_mask[:n_train] = True - val_mask[n_train:n_train + n_val] = True - test_mask[n_train + n_val:] = True - - self.graph.ndata['train_mask'] = train_mask - self.graph.ndata['val_mask'] = val_mask - self.graph.ndata['test_mask'] = test_mask - - def __getitem__(self, i): - return self.graph - - def __len__(self): - return 1 diff --git a/graph/R-GAT/igbh/tiny/models/gnn.py b/graph/R-GAT/igbh/tiny/models/gnn.py deleted file mode 100644 index 20d5ecd72..000000000 --- a/graph/R-GAT/igbh/tiny/models/gnn.py +++ /dev/null @@ -1,296 +0,0 @@ -from utils import IGL260MDataset -import warnings -from tqdm import tqdm -import numpy as np -import time -import torch.nn.functional as F -import torch.optim as optim -import torch.nn as nn -import dgl -from dgl.data import DGLDataset -import dgl.nn.pytorch as dglnn -from dgl.nn.pytorch import GATConv, GraphConv, SAGEConv -import os.path as osp -from sys import getsizeof - - -import torch -torch.manual_seed(0) -dgl.seed(0) -warnings.filterwarnings("ignore") - - -class GCN(nn.Module): - def __init__(self, - in_feats, - n_hidden, - n_classes, - n_layers, - activation, - dropout): - super(GCN, self).__init__() - self.layers = nn.ModuleList() - self.n_layers = n_layers - self.n_hidden = n_hidden - self.n_classes = n_classes - # input layer - self.layers.append( - GraphConv( - in_feats, - n_hidden, - activation=activation)) - # hidden layers - for i in range(n_layers - 1): - self.layers.append( - GraphConv( - n_hidden, - n_hidden, - activation=activation)) - # output layer - self.layers.append(GraphConv(n_hidden, n_classes)) - self.dropout = nn.Dropout(p=dropout) - self.activation = activation - - def forward(self, blocks, x): - h = x - for l, (layer, block) in enumerate(zip(self.layers, blocks)): - if l != len(self.layers) - 1: - # h = self.activation(h) - h = self.dropout(h) - h = layer(block, h) - return h - - def inference(self, g, x, batch_size, device): - """ - Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling). - g : the entire graph. - x : the input of entire node set. - The inference code is written in a fashion that it could handle any number of nodes and - layers. - """ - # During inference with sampling, multi-layer blocks are very inefficient because - # lots of computations in the first few layers are repeated. - # Therefore, we compute the representation of all nodes layer by layer. The nodes - # on each layer are of course splitted in batches. - # TODO: can we standardize this? - for l, layer in enumerate(self.layers): - y = torch.zeros(g.number_of_nodes(), self.n_hidden if l != - len(self.layers) - 1 else self.n_classes) - - sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) - dataloader = dgl.dataloading.NodeDataLoader( - g, - torch.arange(g.number_of_nodes()), - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=4) - - for input_nodes, output_nodes, blocks in dataloader: - block = blocks[0] - - block = block.int().to(device) - h = x[input_nodes].to(device) - h = layer(block, h) - if l != len(self.layers) - 1: - h = self.activation(h) - h = self.dropout(h) - - y[output_nodes] = h.cpu() - - x = y - return y - - -class GAT(nn.Module): - def __init__( - self, in_feats, n_hidden, n_classes, n_layers, num_heads, activation - ): - super().__init__() - self.n_layers = n_layers - self.n_hidden = n_hidden - self.n_classes = n_classes - self.layers = nn.ModuleList() - self.layers.append( - dglnn.GATConv( - (in_feats, in_feats), - n_hidden, - num_heads=num_heads, - activation=activation, - ) - ) - for i in range(1, n_layers - 1): - self.layers.append( - dglnn.GATConv( - (n_hidden * num_heads, n_hidden * num_heads), - n_hidden, - num_heads=num_heads, - activation=activation, - ) - ) - self.layers.append( - dglnn.GATConv( - (n_hidden * num_heads, n_hidden * num_heads), - n_classes, - num_heads=num_heads, - activation=None, - ) - ) - - def forward(self, blocks, x): - h = x - for l, (layer, block) in enumerate(zip(self.layers, blocks)): - # We need to first copy the representation of nodes on the RHS from the - # appropriate nodes on the LHS. - # Note that the shape of h is (num_nodes_LHS, D) and the shape of h_dst - # would be (num_nodes_RHS, D) - h_dst = h[: block.num_dst_nodes()] - # Then we compute the updated representation on the RHS. - # The shape of h now becomes (num_nodes_RHS, D) - if l < self.n_layers - 1: - h = layer(block, (h, h_dst)).flatten(1) - else: - h = layer(block, (h, h_dst)) - h = h.mean(1) - return h.log_softmax(dim=-1) - - def inference(self, g, x, batch_size, device): - """ - Inference with the GAT model on full neighbors (i.e. without neighbor sampling). - g : the entire graph. - x : the input of entire node set. - The inference code is written in a fashion that it could handle any number of nodes and - layers. - """ - # During inference with sampling, multi-layer blocks are very inefficient because - # lots of computations in the first few layers are repeated. - # Therefore, we compute the representation of all nodes layer by layer. The nodes - # on each layer are of course splitted in batches. - # TODO: can we standardize this? - # TODO: make thiw into a variable - num_heads = 2 - for l, layer in enumerate(self.layers): - if l < self.n_layers - 1: - y = torch.zeros( - g.num_nodes(), - self.n_hidden * num_heads - if l != len(self.layers) - 1 - else self.n_classes, - ) - else: - y = torch.zeros( - g.num_nodes(), - self.n_hidden - if l != len(self.layers) - 1 - else self.n_classes, - ) - - sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) - dataloader = dgl.dataloading.DataLoader( - g, - torch.arange(g.num_nodes()), - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=4, - ) - - for input_nodes, output_nodes, blocks in dataloader: - block = blocks[0].int().to(device) - - h = x[input_nodes].to(device) - h_dst = h[: block.num_dst_nodes()] - if l < self.n_layers - 1: - h = layer(block, (h, h_dst)).flatten(1) - else: - h = layer(block, (h, h_dst)) - h = h.mean(1) - h = h.log_softmax(dim=-1) - - y[output_nodes] = h.cpu() - - x = y - return y - - -class SAGE(nn.Module): - def __init__(self, - in_feats, - n_hidden, - n_classes, - n_layers, - activation, - dropout, - aggregator_type): - super().__init__() - self.n_layers = n_layers - self.n_hidden = n_hidden - self.n_classes = n_classes - self.layers = nn.ModuleList() - self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, aggregator_type)) - for i in range(1, n_layers - 1): - self.layers.append( - dglnn.SAGEConv( - n_hidden, - n_hidden, - aggregator_type)) - self.layers.append( - dglnn.SAGEConv( - n_hidden, - n_classes, - aggregator_type)) - self.dropout = nn.Dropout(dropout) - self.activation = activation - - def forward(self, blocks, x): - h = x - for l, (layer, block) in enumerate(zip(self.layers, blocks)): - h = layer(block, h) - if l != len(self.layers) - 1: - h = self.activation(h) - h = self.dropout(h) - return h - - def inference(self, g, x, batch_size, device): - """ - Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling). - g : the entire graph. - x : the input of entire node set. - The inference code is written in a fashion that it could handle any number of nodes and - layers. - """ - # During inference with sampling, multi-layer blocks are very inefficient because - # lots of computations in the first few layers are repeated. - # Therefore, we compute the representation of all nodes layer by layer. The nodes - # on each layer are of course splitted in batches. - # TODO: can we standardize this? - for l, layer in enumerate(self.layers): - y = torch.zeros(g.number_of_nodes(), self.n_hidden if l != - len(self.layers) - 1 else self.n_classes) - - sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) - dataloader = dgl.dataloading.NodeDataLoader( - g, - torch.arange(g.number_of_nodes()), - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=4) - - for input_nodes, output_nodes, blocks in dataloader: - block = blocks[0] - - block = block.int().to(device) - h = x[input_nodes].to(device) - h = layer(block, h) - if l != len(self.layers) - 1: - h = self.activation(h) - h = self.dropout(h) - - y[output_nodes] = h.cpu() - - x = y - return y diff --git a/graph/R-GAT/igbh/tiny/models/main.py b/graph/R-GAT/igbh/tiny/models/main.py deleted file mode 100644 index 4ab22eb75..000000000 --- a/graph/R-GAT/igbh/tiny/models/main.py +++ /dev/null @@ -1,79 +0,0 @@ -import argparse - - -def main(): - parser = argparse.ArgumentParser() - - # Input/output paths - parser.add_argument('--path', type=str, default='/gnndataset/') - parser.add_argument('--modelpath', type=str, default='gcn_19.pt') - - # Dataset selection - parser.add_argument( - '--dataset_size', - type=str, - default='experimental', - choices=[ - 'experimental', - 'small', - 'medium', - 'large', - 'full']) - parser.add_argument( - '--type_classes', - type=int, - default=19, - choices=[ - 19, - 292, - 2983]) - - # Hyperparameters - parser.add_argument('--hidden_channels', type=int, default=16) - parser.add_argument('--fan_out', type=str, default='5,10') - parser.add_argument('--num_layers', type=int, default=2) - parser.add_argument('--learning_rate', type=int, default=0.01) - parser.add_argument('--decay', type=int, default=0.001) - parser.add_argument('--num_workers', type=int, default=4) - parser.add_argument('--batch_size', type=int, default=2048 * 16) - parser.add_argument('--dropout', type=float, default=0.2) - parser.add_argument('--epochs', type=int, default=20) - parser.add_argument( - '--model_type', - type=str, - default='gcn', - choices=[ - 'gat', - 'sage', - 'gcn']) - parser.add_argument('--in_memory', type=int, default=0) - parser.add_argument('--synthetic', type=int, default=0) - parser.add_argument('--device', type=str, default='1') - args = parser.parse_args() - - print("Dataset_size: " + args.dataset_size) - print("Model : " + args.model) - print("Num_classes : " + str(args.num_classes)) - print() - - device = f'cuda:' + args.device if torch.cuda.is_available() else 'cpu' - - dataset = IGL260M_DGL(args) - g = dataset[0] - - best_test_acc, train_acc, test_acc = track_acc(g, args) - - print( - f"Train accuracy: {np.mean(train_acc):.2f} \u00B1 {np.std(train_acc):.2f} \t Best: {np.max(train_acc) * 100:.4f}%") - print( - f"Test accuracy: {np.mean(test_acc):.2f} \u00B1 {np.std(test_acc):.2f} \t Best: {np.max(test_acc) * 100:.4f}%") - print() - print(" -------- For debugging --------- ") - print("Parameters: ", args) - print(g) - print("Train accuracy: ", train_acc) - print("Test accuracy: ", test_acc) - - -if __name__ == '__main__': - main() diff --git a/graph/R-GAT/igbh/tiny/models/utils.py b/graph/R-GAT/igbh/tiny/models/utils.py deleted file mode 100644 index 5e9e1a25d..000000000 --- a/graph/R-GAT/igbh/tiny/models/utils.py +++ /dev/null @@ -1,224 +0,0 @@ -import numpy as np -import torch - - -class IGL260MDataset(object): - def __init__(self, root: str, size: str, in_memory: int, - classes: int, synthetic: int): - self.dir = root - self.size = size - self.synthetic = synthetic - self.in_memory = in_memory - self.num_classes = classes - self.__meta__ = torch.load(osp.join(self.dir, self.size, 'meta.pt')) - - self.num_features = self.__meta__['paper']['emb_dim'] - self.num_nodes = self.__meta__['paper']['num_node'] - self.num_edges = self.__meta__['cites']['num_edge'] - - @property - def paper_feat(self) -> np.ndarray: - if self.synthetic: - return np.random((self.num_nodes, self.num_edges)) - - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper', - 'node_feat.npy') - if self.in_memory: - return np.load(path) - else: - return np.load(path, mmap_mode='r') - - @property - def paper_label(self) -> np.ndarray: - if self.num_classes == 19: - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper', - 'node_label_19.npy') - else: - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper', - 'node_label_2K.npy') - if self.in_memory: - return np.load(path) - else: - return np.load(path, mmap_mode='r') - - @property - def paper_edge(self) -> np.ndarray: - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper__cites__paper', - 'edge_index.npy') - if self.in_memory: - return np.load(path) - else: - return np.load(path, mmap_mode='r') - - -def compute_acc(pred, labels): - """ - Compute the accuracy of prediction given the labels. - """ - labels = labels.long() - return (torch.argmax(pred, dim=1) == labels).float().sum() / len(pred) - - -def evaluate(model, g, inputs, labels, val_nid, batch_size, device): - """ - Evaluate the model on the validation set specified by ``val_nid``. - g : The entire graph. - inputs : The features of all the nodes. - labels : The labels of all the nodes. - val_nid : the node Ids for validation. - batch_size : Number of nodes to compute at the same time. - device : The GPU device to evaluate on. - """ - model.eval() - with torch.no_grad(): - pred = model.inference(g, inputs, batch_size, device) - model.train() - return compute_acc(pred[val_nid], labels[val_nid]) - - -def load_subtensor(g, seeds, input_nodes, device): - """ - Copys features and labels of a set of nodes onto GPU. - """ - batch_inputs = g.ndata['features'][input_nodes].to(device) - batch_labels = g.ndata['labels'][seeds].to(device) - return batch_inputs, batch_labels - - -def track_acc(g, args): - train_accuracy = [] - test_accuracy = [] - g.ndata['features'] = g.ndata['feat'] - g.ndata['labels'] = g.ndata['label'] - in_feats = g.ndata['features'].shape[1] - n_classes = args.num_classes - - # Create csr/coo/csc formats before launching training processes with multi-gpu. - # This avoids creating certain formats in each sub-process, which saves - # momory and CPU. - g.create_formats_() - - num_epochs = args.epochs - num_hidden = args.hidden_channels - num_layers = args.num_layers - fan_out = args.fan_out - batch_size = args.batch_size - lr = args.learning_rate - dropout = args.dropout - num_workers = args.num_workers - - train_nid = torch.nonzero(g.ndata['train_mask'], as_tuple=True)[0] - - # Create PyTorch DataLoader for constructing blocks - sampler = dgl.dataloading.MultiLayerNeighborSampler( - [int(fanout) for fanout in fan_out.split(',')]) - - dataloader = dgl.dataloading.NodeDataLoader( - g, - train_nid, - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=num_workers) - - # Define model and optimizer - if args.model_type == 'gcn': - model = GCN(in_feats, num_hidden, n_classes, 1, F.relu, dropout) - if args.model_type == 'sage': - model = SAGE( - in_feats, - num_hidden, - n_classes, - num_layers, - F.relu, - dropout, - 'gcn') - if args.model_type == 'gat': - model = GAT(in_feats, num_hidden, n_classes, num_layers, 2, F.relu) - - model = model.to(device) - loss_fcn = nn.CrossEntropyLoss() - loss_fcn = loss_fcn.to(device) - optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=args.decay) - - # Training loop - avg = 0 - best_test_acc = 0 - log_every = 1 - training_start = time.time() - for epoch in (range(num_epochs)): - # Loop over the dataloader to sample the computation dependency graph as a list of - # blocks. - epoch_loss = 0 - gpu_mem_alloc = 0 - epoch_start = time.time() - for step, (input_nodes, seeds, blocks) in (enumerate(dataloader)): - # Load the input features as well as output labels - # batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device) - blocks = [block.int().to(device) for block in blocks] - batch_inputs = blocks[0].srcdata['features'] - batch_labels = blocks[-1].dstdata['labels'] - - # Compute loss and prediction - batch_pred = model(blocks, batch_inputs) - loss = loss_fcn(batch_pred, batch_labels) - optimizer.zero_grad() - loss.backward() - optimizer.step() - - epoch_loss += loss.detach() - - gpu_mem_alloc += ( - torch.cuda.max_memory_allocated() / 1000000 - if torch.cuda.is_available() - else 0 - ) - - train_g = g - train_nid = torch.nonzero( - train_g.ndata['train_mask'], as_tuple=True)[0] - train_acc = evaluate( - model, train_g, train_g.ndata['features'], train_g.ndata['labels'], train_nid, batch_size, device) - - test_g = g - test_nid = torch.nonzero( - test_g.ndata['test_mask'], as_tuple=True)[0] - test_acc = evaluate( - model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, batch_size, device) - - if test_acc.item() > best_test_acc: - best_test_acc = test_acc.item() - tqdm.write( - "Epoch {:05d} | Loss {:.4f} | Train Acc {:.4f} | Test Acc {:.4f} | Time {:.2f}s | GPU {:.1f} MB".format( - epoch, - epoch_loss, - train_acc.item(), - test_acc.item(), - time.time() - epoch_start, - gpu_mem_alloc - ) - ) - test_accuracy.append(test_acc.item()) - train_accuracy.append(train_acc.item()) - torch.save(model.state_dict(), args.modelpath) - print() - print("Total time taken: ", time.time() - training_start) - - return best_test_acc, train_accuracy, test_accuracy