optimizing VICReg (#173)
* add evaluate script

* set VICReg to evaluation mode

* print best validation loss on vicreg loss plot

* multigpu compatibility

* Revert to bd2a3e8

* fix loading of mlpf models

* add plots for each component of each loss

* oops

* add notebook for ssl optimizations

* fix quick datasplit mode to make sense

* add vicreg embeddings after gnn in mlpf

* for num_convs=0

* mdmm

* add extra dnn layer for native

* early results

* pooling after decoder

* update vicreg, loss and multigpu

* up early stopping

* better gpu util

---------

Co-authored-by: Joosep Pata <joosep.pata@gmail.com>
farakiko and jpata authored Mar 1, 2023
1 parent 359b892 commit 9d84f26
Showing 11 changed files with 21,953 additions and 423 deletions.
7,679 changes: 7,679 additions & 0 deletions mlpf/mdmm.ipynb

Large diffs are not rendered by default.
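The notebook itself is not rendered here, but per the "mdmm" commit above it experiments with the Modified Differential Method of Multipliers (MDMM), which turns a weighted multi-term loss into a constrained problem: minimize the main loss while a secondary loss is held at a target value by a Lagrange multiplier updated via gradient ascent. A minimal sketch in plain PyTorch, assuming a generic main/auxiliary loss pair (all names and values below are illustrative, not taken from the notebook):

import torch
import torch.nn.functional as F

model = torch.nn.Linear(10, 1)        # stand-in for the real network
opt = torch.optim.SGD(model.parameters(), lr=1e-3)
lam = torch.zeros(())                 # Lagrange multiplier, updated by hand below
eps, c, lr_lam = 0.1, 1.0, 1e-3       # constraint target, damping strength, multiplier step

def training_step(x, y):
    global lam
    pred = model(x)
    loss_main = F.mse_loss(pred, y)
    loss_aux = pred.abs().mean()      # stand-in for the constrained loss term
    infeas = loss_aux - eps           # constraint violation g(theta)
    # damped Lagrangian: descend on the model parameters ...
    lagrangian = loss_main + lam * infeas + 0.5 * c * infeas.pow(2)
    opt.zero_grad()
    lagrangian.backward()
    opt.step()
    # ... and ascend on the multiplier (dL/dlam = g)
    lam = lam + lr_lam * infeas.detach()
    return loss_main.item()

x, y = torch.randn(8, 10), torch.randn(8, 1)
for _ in range(3):
    training_step(x, y)

Compared to fixed loss weights, the multiplier adapts during training, which is presumably the appeal for balancing the VICReg loss terms.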

62 changes: 59 additions & 3 deletions mlpf/pyg_ssl/VICReg.py
@@ -1,12 +1,66 @@
import torch.nn as nn
from torch_geometric.data import Batch
from torch_geometric.nn import global_mean_pool
from torch_geometric.nn.conv import GravNetConv

from .utils import CLUSTERS_X, TRACKS_X


# define the VICReg model that learns latent representations of tracks and clusters
# these representations will be used by MLPF, the downstream task
class VICReg(nn.Module):
def __init__(self, encoder, decoder):
super(VICReg, self).__init__()
self.encoder = encoder
self.decoder = decoder

def distinguish_PFelements(self, batch):
"""Takes an event~Batch() and splits it into two Batch() objects representing the tracks/clusters."""

track_id = 1
cluster_id = 2

tracks = Batch(
x=batch.x[batch.x[:, 0] == track_id][:, 1:].float()[
:, :TRACKS_X
], # remove the first input feature which is not needed anymore
ygen=batch.ygen[batch.x[:, 0] == track_id],
ygen_id=batch.ygen_id[batch.x[:, 0] == track_id],
ycand=batch.ycand[batch.x[:, 0] == track_id],
ycand_id=batch.ycand_id[batch.x[:, 0] == track_id],
batch=batch.batch[batch.x[:, 0] == track_id],
)
clusters = Batch(
x=batch.x[batch.x[:, 0] == cluster_id][:, 1:].float()[
:, :CLUSTERS_X
], # remove the first input feature which is not needed anymore
ygen=batch.ygen[batch.x[:, 0] == cluster_id],
ygen_id=batch.ygen_id[batch.x[:, 0] == cluster_id],
ycand=batch.ycand[batch.x[:, 0] == cluster_id],
ycand_id=batch.ycand_id[batch.x[:, 0] == cluster_id],
batch=batch.batch[batch.x[:, 0] == cluster_id],
)
return tracks, clusters

def forward(self, event):

# separate tracks from clusters
tracks, clusters = self.distinguish_PFelements(event)

# encode to retrieve the representations
track_representations, cluster_representations = self.encoder(tracks, clusters)

# decode/expand to get the embeddings
embedding_tracks, embedding_clusters = self.decoder(track_representations, cluster_representations)

# global pooling to be able to compute a loss
pooled_tracks = global_mean_pool(embedding_tracks, tracks.batch)
pooled_clusters = global_mean_pool(embedding_clusters, clusters.batch)

return pooled_tracks, pooled_clusters


class ENCODER(nn.Module):
"""The Encoder part of VICReg which attempts to learns useful latent representations of tracks and clusters."""

def __init__(
self,
width=126,
@@ -66,8 +120,10 @@ def forward(self, tracks, clusters):
return embedding_tracks, embedding_clusters


# define the decoder that expands the latent representations of tracks and clusters
class DECODER(nn.Module):
"""The Decoder part of VICReg which attempts to expand the learned latent representations
of tracks and clusters into a space where a loss can be computed."""

def __init__(
self,
input_dim=34,
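The loss that consumes the (pooled_tracks, pooled_clusters) pair returned by VICReg.forward above is not part of this diff, but the --lmbd/--mu/--nu weights in args.py below match the three terms of the standard VICReg objective (invariance, variance, covariance; Bardes et al.). A minimal sketch, assuming the repository follows the reference formulation (the per-component bookkeeping behind the "plots for each component of each loss" commit may differ):

import torch
import torch.nn.functional as F

def vicreg_loss(z1, z2, lmbd=1.0, mu=0.1, nu=1e-9, gamma=1.0, eps=1e-4):
    # z1, z2: (batch, dim) pooled embeddings, e.g. pooled_tracks and pooled_clusters;
    # the lmbd/mu/nu defaults mirror the new args.py values
    # invariance: the two views should embed to the same point
    inv = F.mse_loss(z1, z2)
    # variance: hinge keeping the std of every embedding dimension above gamma
    var = (F.relu(gamma - torch.sqrt(z1.var(dim=0) + eps)).mean()
           + F.relu(gamma - torch.sqrt(z2.var(dim=0) + eps)).mean())
    # covariance: penalize off-diagonal covariance to decorrelate dimensions
    def off_diag_cov(z):
        z = z - z.mean(dim=0)
        cov = (z.T @ z) / (z.shape[0] - 1)
        return (cov - torch.diag(torch.diag(cov))).pow(2).sum() / z.shape[1]
    return lmbd * inv + mu * var + nu * (off_diag_cov(z1) + off_diag_cov(z2))

The global mean pooling in forward ("pooling after decoder" commit) is what makes this computable: the variance and covariance terms need a (batch, dim) matrix rather than a ragged per-element one.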
12 changes: 6 additions & 6 deletions mlpf/pyg_ssl/args.py
@@ -28,17 +28,17 @@ def parse_args():
parser.add_argument("--overwrite", dest="overwrite", action="store_true", help="overwrites the model if True")

# training hyperparameters
parser.add_argument("--lmbd", type=float, default=0.01, help="the lambda term in the VICReg loss")
parser.add_argument("--u", type=float, default=0.1, help="the mu term in the VICReg loss")
parser.add_argument("--v", type=float, default=0.1, help="the nu term in the VICReg loss")
parser.add_argument("--lmbd", type=float, default=1, help="the lambda term in the VICReg loss")
parser.add_argument("--mu", type=float, default=0.1, help="the mu term in the VICReg loss")
parser.add_argument("--nu", type=float, default=1e-9, help="the nu term in the VICReg loss")
parser.add_argument("--n_epochs_mlpf", type=int, default=3, help="number of training epochs for mlpf")
parser.add_argument("--n_epochs_VICReg", type=int, default=3, help="number of training epochs for VICReg")
parser.add_argument("--lr", type=float, default=5e-5, help="learning rate")
parser.add_argument("--batch_size_mlpf", type=int, default=500, help="number of events to process at once")
parser.add_argument("--batch_size_VICReg", type=int, default=2000, help="number of events to process at once")
parser.add_argument("--bs_mlpf", type=int, default=500, help="number of events to process at once")
parser.add_argument("--bs_VICReg", type=int, default=2000, help="number of events to process at once")
parser.add_argument("--patience", type=int, default=50, help="patience before early stopping")
parser.add_argument(
"--FineTune_VICReg", dest="FineTune_VICReg", action="store_true", help="FineTune VICReg during MLPFtraining"
"--FineTune_VICReg", dest="FineTune_VICReg", action="store_true", help="FineTune VICReg during MLPF training"
)

# VICReg encoder architecture
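A quick way to check the renamed flags (--mu/--nu replacing --u/--v, --bs_* replacing --batch_size_*), assuming parse_args() reads sys.argv as its name suggests; the argv[0] entry-point name below is a placeholder:

import sys
from mlpf.pyg_ssl.args import parse_args  # assumes the repo root is on PYTHONPATH

sys.argv = ["ssl_train", "--lmbd", "1", "--mu", "0.1", "--nu", "1e-9",
            "--bs_mlpf", "500", "--bs_VICReg", "2000", "--FineTune_VICReg"]
args = parse_args()
print(args.lmbd, args.mu, args.nu, args.bs_VICReg, args.FineTune_VICReg)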
4 changes: 1 addition & 3 deletions mlpf/pyg_ssl/evaluate.py
@@ -42,7 +42,7 @@ def particle_array_to_awkward(batch_ids, arr_id, arr_p4):
return ret


def evaluate(device, encoder, decoder, mlpf, batch_size_mlpf, mode, outpath, samples):
def evaluate(device, encoder, mlpf, batch_size_mlpf, mode, outpath, samples):
import fastjet
import vector
from jet_utils import build_dummy_array, match_two_jet_collections
@@ -60,7 +60,6 @@ def evaluate(device, encoder, decoder, mlpf, batch_size_mlpf, mode, outpath, samples):

mlpf.eval()
encoder.eval()
decoder.eval()
for sample, data in samples.items():
print(f"Testing the {mode} model on the {sample}")

@@ -74,7 +73,6 @@ def evaluate(device, encoder, mlpf, batch_size_mlpf, mode, outpath, samples):

mlpf.eval()
encoder.eval()
decoder.eval()

conf_matrix = np.zeros((6, 6))
with torch.no_grad():
204 changes: 123 additions & 81 deletions mlpf/pyg_ssl/mlpf.py
Expand Up @@ -51,25 +51,49 @@ def forward(self, x, mask):
return x


def ffn(input_dim, output_dim, width, act, dropout):
return nn.Sequential(
nn.Linear(input_dim, width),
act(),
torch.nn.LayerNorm(width),
nn.Dropout(dropout),
nn.Linear(width, width),
act(),
torch.nn.LayerNorm(width),
nn.Dropout(dropout),
nn.Linear(width, width),
act(),
torch.nn.LayerNorm(width),
nn.Dropout(dropout),
nn.Linear(width, width),
act(),
torch.nn.LayerNorm(width),
nn.Linear(width, output_dim),
)
def ffn(input_dim, output_dim, width, act, dropout, ssl):
if ssl:
return nn.Sequential(
nn.Linear(input_dim, width),
act(),
torch.nn.LayerNorm(width),
nn.Dropout(dropout),
nn.Linear(width, width),
act(),
torch.nn.LayerNorm(width),
nn.Dropout(dropout),
nn.Linear(width, width),
act(),
torch.nn.LayerNorm(width),
nn.Dropout(dropout),
nn.Linear(width, width),
act(),
torch.nn.LayerNorm(width),
nn.Linear(width, output_dim),
)
else:
return nn.Sequential(
nn.Linear(input_dim, width),
act(),
torch.nn.LayerNorm(width),
nn.Dropout(dropout),
nn.Linear(width, width),
act(),
torch.nn.LayerNorm(width),
nn.Dropout(dropout),
nn.Linear(width, width),
act(),
torch.nn.LayerNorm(width),
nn.Dropout(dropout),
nn.Linear(width, width),
act(),
torch.nn.LayerNorm(width),
nn.Dropout(dropout),
nn.Linear(width, width),
act(),
torch.nn.LayerNorm(width),
nn.Linear(width, output_dim),
)


class MLPF(nn.Module):
@@ -80,99 +104,117 @@ def __init__(
width=126,
num_convs=2,
k=32,
native_mlpf=False,
propagate_dimensions=32,
space_dimensions=4,
dropout=0.4,
ssl=False,
VICReg_embedding_dim=0,
):
super(MLPF, self).__init__()

self.act = nn.ELU
self.native_mlpf = native_mlpf # boolean that is true for native mlpf and false for ssl
self.dropout = dropout
self.input_dim = input_dim
self.num_convs = num_convs
self.ssl = ssl # boolean that is True for ssl and False for native mlpf

# embedding of the inputs
self.nn0 = nn.Sequential(
nn.Linear(input_dim, width),
self.act(),
nn.Linear(width, width),
self.act(),
nn.Linear(width, width),
self.act(),
nn.Linear(width, embedding_dim),
)

self.conv_type = "gravnet"
# GNN that uses the embeddings learnt by VICReg as the input features
if self.conv_type == "gravnet":
self.conv_id = nn.ModuleList()
self.conv_reg = nn.ModuleList()
for i in range(num_convs):
self.conv_id.append(GravNetLayer(embedding_dim, space_dimensions, propagate_dimensions, k, dropout))
self.conv_reg.append(GravNetLayer(embedding_dim, space_dimensions, propagate_dimensions, k, dropout))
elif self.conv_type == "attention":
self.conv_id = nn.ModuleList()
self.conv_reg = nn.ModuleList()

for i in range(num_convs):
self.conv_id.append(SelfAttentionLayer(embedding_dim))
self.conv_reg.append(SelfAttentionLayer(embedding_dim))
if num_convs != 0:
self.nn0 = nn.Sequential(
nn.Linear(input_dim, width),
self.act(),
nn.Linear(width, width),
self.act(),
nn.Linear(width, width),
self.act(),
nn.Linear(width, embedding_dim),
)

self.conv_type = "gravnet"
# GNN that uses the embeddings learnt by VICReg as the input features
if self.conv_type == "gravnet":
self.conv_id = nn.ModuleList()
self.conv_reg = nn.ModuleList()
for i in range(num_convs):
self.conv_id.append(GravNetLayer(embedding_dim, space_dimensions, propagate_dimensions, k, dropout))
self.conv_reg.append(GravNetLayer(embedding_dim, space_dimensions, propagate_dimensions, k, dropout))
elif self.conv_type == "attention":
self.conv_id = nn.ModuleList()
self.conv_reg = nn.ModuleList()

for i in range(num_convs):
self.conv_id.append(SelfAttentionLayer(embedding_dim))
self.conv_reg.append(SelfAttentionLayer(embedding_dim))

decoding_dim = input_dim + num_convs * embedding_dim
if ssl:
decoding_dim += VICReg_embedding_dim

# DNN that acts on the node level to predict the PID
self.nn_id = ffn(decoding_dim, NUM_CLASSES, width, self.act, dropout)
self.nn_id = ffn(decoding_dim, NUM_CLASSES, width, self.act, dropout, ssl)

# elementwise DNN for node momentum regression
self.nn_pt = ffn(decoding_dim + NUM_CLASSES, 1, width, self.act, dropout)
self.nn_eta = ffn(decoding_dim + NUM_CLASSES, 1, width, self.act, dropout)
self.nn_phi = ffn(decoding_dim + NUM_CLASSES, 1, width, self.act, dropout)
self.nn_energy = ffn(decoding_dim + NUM_CLASSES, 1, width, self.act, dropout)
self.nn_pt = ffn(decoding_dim + NUM_CLASSES, 1, width, self.act, dropout, ssl)
self.nn_eta = ffn(decoding_dim + NUM_CLASSES, 1, width, self.act, dropout, ssl)
self.nn_phi = ffn(decoding_dim + NUM_CLASSES, 1, width, self.act, dropout, ssl)
self.nn_energy = ffn(decoding_dim + NUM_CLASSES, 1, width, self.act, dropout, ssl)

# elementwise DNN for node charge regression, classes (-1, 0, 1)
self.nn_charge = ffn(decoding_dim + NUM_CLASSES, 3, width, self.act, dropout)
self.nn_charge = ffn(decoding_dim + NUM_CLASSES, 3, width, self.act, dropout, ssl)

def forward(self, batch):

# unfold the Batch object
input_ = batch.x.float()
batch_idx = batch.batch
if self.ssl:
input_ = batch.x.float()[:, : self.input_dim]
VICReg_embeddings = batch.x.float()[:, self.input_dim :]
else:
input_ = batch.x.float()

embedding = self.nn0(input_)
batch_idx = batch.batch

embeddings_id = []
embeddings_reg = []

if self.conv_type == "gravnet":
# perform a series of graph convolutions
for num, conv in enumerate(self.conv_id):
conv_input = embedding if num == 0 else embeddings_id[-1]
embeddings_id.append(conv(conv_input, batch_idx))
for num, conv in enumerate(self.conv_reg):
conv_input = embedding if num == 0 else embeddings_reg[-1]
embeddings_reg.append(conv(conv_input, batch_idx))
elif self.conv_type == "attention":
for num, conv in enumerate(self.conv_id):
conv_input = embedding if num == 0 else embeddings_id[-1]
input_padded, mask = torch_geometric.utils.to_dense_batch(conv_input, batch_idx)
out_padded = conv(input_padded, ~mask)
out_stacked = torch.cat([out_padded[i][mask[i]] for i in range(out_padded.shape[0])])
assert out_stacked.shape[0] == conv_input.shape[0]
embeddings_id.append(out_stacked)
for num, conv in enumerate(self.conv_reg):
conv_input = embedding if num == 0 else embeddings_reg[-1]
input_padded, mask = torch_geometric.utils.to_dense_batch(conv_input, batch_idx)
out_padded = conv(input_padded, ~mask)
out_stacked = torch.cat([out_padded[i][mask[i]] for i in range(out_padded.shape[0])])
assert out_stacked.shape[0] == conv_input.shape[0]
embeddings_reg.append(out_stacked)

embedding_id = torch.cat([input_] + embeddings_id, axis=-1)
if self.num_convs != 0:
embedding = self.nn0(input_)

if self.conv_type == "gravnet":
# perform a series of graph convolutions
for num, conv in enumerate(self.conv_id):
conv_input = embedding if num == 0 else embeddings_id[-1]
embeddings_id.append(conv(conv_input, batch_idx))
for num, conv in enumerate(self.conv_reg):
conv_input = embedding if num == 0 else embeddings_reg[-1]
embeddings_reg.append(conv(conv_input, batch_idx))
elif self.conv_type == "attention":
for num, conv in enumerate(self.conv_id):
conv_input = embedding if num == 0 else embeddings_id[-1]
input_padded, mask = torch_geometric.utils.to_dense_batch(conv_input, batch_idx)
out_padded = conv(input_padded, ~mask)
out_stacked = torch.cat([out_padded[i][mask[i]] for i in range(out_padded.shape[0])])
assert out_stacked.shape[0] == conv_input.shape[0]
embeddings_id.append(out_stacked)
for num, conv in enumerate(self.conv_reg):
conv_input = embedding if num == 0 else embeddings_reg[-1]
input_padded, mask = torch_geometric.utils.to_dense_batch(conv_input, batch_idx)
out_padded = conv(input_padded, ~mask)
out_stacked = torch.cat([out_padded[i][mask[i]] for i in range(out_padded.shape[0])])
assert out_stacked.shape[0] == conv_input.shape[0]
embeddings_reg.append(out_stacked)

if self.ssl:
embedding_id = torch.cat([input_] + embeddings_id + [VICReg_embeddings], axis=-1)
else:
embedding_id = torch.cat([input_] + embeddings_id, axis=-1)

# predict the PIDs
preds_id = self.nn_id(embedding_id)

embedding_reg = torch.cat([input_] + embeddings_reg + [preds_id], axis=-1)
if self.ssl:
embedding_reg = torch.cat([input_] + embeddings_reg + [preds_id] + [VICReg_embeddings], axis=-1)
else:
embedding_reg = torch.cat([input_] + embeddings_reg + [preds_id], axis=-1)

# predict the 4-momentum, add it to the (pt, eta, phi, E) of the PFelement
preds_pt = self.nn_pt(embedding_reg) + input_[:, 1:2]
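Two behaviours of the new MLPF are easy to miss in the diff: with num_convs=0 the nn0 embedding and both conv stacks are skipped entirely, so the ffn heads (which keep an extra hidden layer in native mode, per the "add extra dnn layer for native" commit) see only the raw features; and in ssl mode the VICReg embeddings are expected to arrive pre-concatenated onto batch.x, are sliced off via input_dim, and are re-appended after the convolutions ("add vicreg embeddings after gnn in mlpf"). A hedged sketch of that calling convention with toy shapes (glue code and sizes are illustrative, not from this commit):

import torch
from torch_geometric.data import Batch, Data
from mlpf.pyg_ssl.mlpf import MLPF  # assumes the repo root is on PYTHONPATH

N_ELEM, N_FEAT, VICREG_DIM = 50, 12, 34   # illustrative sizes only

# one toy "event" plus a stand-in for the frozen VICReg encoder's output
event = Batch.from_data_list([Data(x=torch.randn(N_ELEM, N_FEAT))])
vicreg_embeddings = torch.randn(N_ELEM, VICREG_DIM)

# ssl mode: the caller appends the embeddings to the node features; forward()
# slices them off with input_dim, runs the convolutions (skipped if num_convs=0),
# then re-concatenates them before the nn_id and regression heads;
# remaining hyperparameters are assumed to keep their defaults
event.x = torch.cat([event.x, vicreg_embeddings], dim=-1)
model = MLPF(input_dim=N_FEAT, ssl=True, VICReg_embedding_dim=VICREG_DIM)
outputs = model(event)  # PID logits and regressed kinematics (tail of forward() is truncated above)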
