diff --git a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py index 9a01aa81f..74837eb43 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/ttbar.py +++ b/mlpf/heptfds/clic_pf_edm4hep/ttbar.py @@ -1,6 +1,7 @@ from pathlib import Path import tensorflow as tf +import tensorflow_datasets as tfds from utils_edm import ( X_FEATURES_CL, X_FEATURES_TRK, @@ -9,8 +10,6 @@ split_sample, ) -import tensorflow_datasets as tfds - _DESCRIPTION = """ CLIC EDM4HEP dataset with ee -> ttbar at 380GeV. - X: reconstructed tracks and clusters, variable number N per event @@ -26,7 +25,7 @@ class ClicEdmTtbarPf(tfds.core.GeneratorBasedBuilder): - VERSION = tfds.core.Version("2.1.0") + VERSION = tfds.core.Version("2.2.0") RELEASE_NOTES = { "1.0.0": "Initial release.", "1.1.0": "update stats, move to 380 GeV", @@ -36,6 +35,7 @@ class ClicEdmTtbarPf(tfds.core.GeneratorBasedBuilder): "1.5.0": "Regenerate with ARRAY_RECORD", "2.0.0": "Add ispu, genjets, genmet; disable genjet_idx; truth def not based on gp.status==1", "2.1.0": "Bump dataset size", + "2.2.0": "Additional cluster input features", } MANUAL_DOWNLOAD_INSTRUCTIONS = """ For the raw input files in ROOT EDM4HEP format, please see the citation above. diff --git a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py index b0f152d9c..68e0b610e 100644 --- a/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py +++ b/mlpf/heptfds/clic_pf_edm4hep/utils_edm.py @@ -1,6 +1,7 @@ +import random + import awkward as ak import numpy as np -import random # from fcc/postprocessing.py X_FEATURES_TRK = [ @@ -39,6 +40,16 @@ "sigma_x", "sigma_y", "sigma_z", + # additional cluster input features + "energyError", + "sigma_energy", + "sigma_x_weighted", + "sigma_y_weighted", + "sigma_z_weighted", + "energy_weighted_width", + "pos_shower_max", + "width_shower_max", + "energy_shower_max", ] Y_FEATURES = ["PDG", "charge", "pt", "eta", "sin_phi", "cos_phi", "energy", "ispu"] diff --git a/mlpf/pyg/mlpf.py b/mlpf/pyg/mlpf.py index 59d7564ab..163b880e2 100644 --- a/mlpf/pyg/mlpf.py +++ b/mlpf/pyg/mlpf.py @@ -1,13 +1,13 @@ +import math + +import numpy as np import torch import torch.nn as nn - -from .gnn_lsh import CombinedGraphLayer - from pyg.logger import _logger -import math -import numpy as np from torch.nn.attention import SDPBackend, sdpa_kernel +from .gnn_lsh import CombinedGraphLayer + def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): # From https://github.com/rwightman/pytorch-image-models/blob/ @@ -57,6 +57,32 @@ def norm_cdf(x): return tensor +def standardize_input(X, elemtypes_nonzero, standardization_dict): + + for i, ielem in enumerate(elemtypes_nonzero): + + Xfeat_normed_msked = X.clone() + + # get mean/std of features of that elem + mean = torch.tensor(standardization_dict[f"PFelement{ielem}"]["mean"]).to(Xfeat_normed_msked.device) + std = torch.tensor(standardization_dict[f"PFelement{ielem}"]["std"]).to(Xfeat_normed_msked.device) + + # standardize + Xfeat_normed_msked[..., 1:] = (Xfeat_normed_msked[..., 1:] - mean[..., 1:]) / std[..., 1:] + + # msk other elements + msk = Xfeat_normed_msked[..., 0:1] == ielem + Xfeat_normed_msked = Xfeat_normed_msked * msk + Xfeat_normed_msked = torch.nan_to_num(Xfeat_normed_msked, nan=0.0) + + if i == 0: + Xfeat_normed = Xfeat_normed_msked + else: + Xfeat_normed += Xfeat_normed_msked + + return Xfeat_normed + + def get_activation(activation): if activation == "elu": act = nn.ELU @@ -372,9 +398,12 @@ def __init__( self.final_norm_reg = torch.nn.LayerNorm(embed_dim) # @torch.compile - def 
forward(self, X_features, mask): + def forward(self, X_features, mask, standardization_dict=None): Xfeat_normed = X_features + if standardization_dict is not None: + Xfeat_normed = standardize_input(X_features, self.elemtypes_nonzero, standardization_dict) + embeddings_id, embeddings_reg = [], [] if self.num_convs != 0: if self.input_encoding == "joint": diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py index 7c2a67e35..2e1432125 100644 --- a/mlpf/pyg/training.py +++ b/mlpf/pyg/training.py @@ -1,58 +1,57 @@ +import csv +import glob +import json +import logging import os import os.path as osp import pickle as pkl +import shutil import time +from datetime import datetime from pathlib import Path from tempfile import TemporaryDirectory from typing import Optional -import logging -import shutil -from datetime import datetime -import tqdm -import yaml -import csv -import json -import sklearn -import sklearn.metrics -import numpy as np -import pandas + +import fastjet import matplotlib import matplotlib.pyplot as plt -import glob +import numpy as np +import pandas -# comet needs to be imported before torch -from comet_ml import OfflineExperiment, Experiment # noqa: F401, isort:skip +# import sklearn +# import sklearn.metrics import torch import torch.distributed as dist import torch.multiprocessing as mp -from torch import Tensor, nn -from torch.nn import functional as F -from torch.profiler import ProfilerActivity, profile, record_function -from torch.utils.tensorboard import SummaryWriter - -from pyg.logger import _logger, _configLogger +import tqdm +import yaml +from pyg.inference import make_plots, run_predictions +from pyg.logger import _configLogger, _logger +from pyg.mlpf import MLPF, set_save_attention +from pyg.PFDataset import Collater, PFDataset, get_interleaved_dataloaders from pyg.utils import ( - unpack_predictions, - unpack_target, + CLASS_LABELS, + ELEM_TYPES_NONZERO, + X_FEATURES, + count_parameters, + get_input_standardization, + get_lr_schedule, get_model_state_dict, load_checkpoint, save_checkpoint, - CLASS_LABELS, - X_FEATURES, - ELEM_TYPES_NONZERO, save_HPs, - get_lr_schedule, - count_parameters, + unpack_predictions, + unpack_target, ) +from torch import Tensor, nn +from torch.nn import functional as F +from torch.profiler import ProfilerActivity, profile, record_function +from torch.utils.tensorboard import SummaryWriter +from utils import create_comet_experiment +# comet needs to be imported before torch +from comet_ml import OfflineExperiment, Experiment # noqa: F401, isort:skip -import fastjet -from pyg.inference import make_plots, run_predictions - -from pyg.mlpf import set_save_attention -from pyg.mlpf import MLPF -from pyg.PFDataset import Collater, PFDataset, get_interleaved_dataloaders -from utils import create_comet_experiment # Ignore divide by 0 errors np.seterr(divide="ignore", invalid="ignore") @@ -74,7 +73,7 @@ def sliced_wasserstein_loss(y_pred, y_true, num_projections=200): return ret -def mlpf_loss(y, ypred, batch): +def mlpf_loss(y, ypred, batch, epoch): """ Args y [dict]: relevant keys are "cls_id, momentum, charge" @@ -436,6 +435,7 @@ def train_and_valid( dtype=torch.float32, tensorboard_writer=None, save_attention=False, + standardization_dict=None, ): """ Performs training over a given epoch. Will run a validation step every N_STEPS and after the last training batch. 
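
For orientation, a minimal sketch (not part of the patch) of how the standardization_dict produced by get_input_standardization is laid out and how it flows into the new standardize_input/forward path added above. The element types, feature counts, and all numeric values are invented purely for illustration; it assumes standardize_input is available as defined in mlpf/pyg/mlpf.py in this diff.

import torch

# Hypothetical per-element-type statistics: one "mean"/"std" entry per input feature.
# Feature 0 (the element type id) is never standardized, so its entries are placeholders.
standardization_dict = {
    "PFelement1": {"mean": [0.0, 1.2, 0.0, 0.0], "std": [1.0, 3.4, 1.0, 2.0]},
    "PFelement2": {"mean": [0.0, 5.6, 0.1, 0.0], "std": [1.0, 9.8, 0.5, 1.5]},
}

# Toy batch with shape (batch, elements, features); feature 0 encodes the element type,
# and zero-padded elements (elemtype 0) are zeroed out by the per-type masks.
X = torch.tensor([[[1.0, 2.0, 0.3, -1.0],
                   [2.0, 7.0, 0.2,  0.5],
                   [0.0, 0.0, 0.0,  0.0]]])
mask = X[..., 0] != 0

Xfeat_normed = standardize_input(X, elemtypes_nonzero=[1, 2], standardization_dict=standardization_dict)
# In train_and_valid the same dict is simply forwarded to the model:
# ypred_raw = model(X, mask, standardization_dict)
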
@@ -464,10 +464,10 @@ def train_and_valid( loss_accum = 0.0 val_freq_time_0 = time.time() - if not is_train: - cm_X_gen = np.zeros((13, 13)) - cm_X_pred = np.zeros((13, 13)) - cm_id = np.zeros((13, 13)) + # if not is_train: + # cm_X_gen = np.zeros((13, 13)) + # cm_X_pred = np.zeros((13, 13)) + # cm_id = np.zeros((13, 13)) for itrain, batch in iterator: set_save_attention(model, outdir, False) @@ -480,37 +480,43 @@ def train_and_valid( with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): if is_train: - ypred_raw = model(batch.X, batch.mask) + ypred_raw = model(batch.X, batch.mask, standardization_dict) else: with torch.no_grad(): # save some attention matrices if save_attention and (rank == 0 or rank == "cpu") and itrain == 0: set_save_attention(model, outdir, True) - ypred_raw = model(batch.X, batch.mask) + ypred_raw = model(batch.X, batch.mask, standardization_dict) ypred = unpack_predictions(ypred_raw) - if not is_train: - cm_X_gen += sklearn.metrics.confusion_matrix( - batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), ygen["cls_id"][batch.mask].detach().cpu().numpy(), labels=range(13) - ) - cm_X_pred += sklearn.metrics.confusion_matrix( - batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), ypred["cls_id"][batch.mask].detach().cpu().numpy(), labels=range(13) - ) - cm_id += sklearn.metrics.confusion_matrix( - ygen["cls_id"][batch.mask].detach().cpu().numpy(), ypred["cls_id"][batch.mask].detach().cpu().numpy(), labels=range(13) - ) - # save the events of the first validation batch for quick checks - if (rank == 0 or rank == "cpu") and itrain == 0: - validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, outdir) + # if not is_train: + # cm_X_gen += sklearn.metrics.confusion_matrix( + # batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), + # ygen["cls_id"][batch.mask].detach().cpu().numpy(), + # labels=range(13), + # ) + # cm_X_pred += sklearn.metrics.confusion_matrix( + # batch.X[:, :, 0][batch.mask].detach().cpu().numpy(), + # ypred["cls_id"][batch.mask].detach().cpu().numpy(), + # labels=range(13), + # ) + # cm_id += sklearn.metrics.confusion_matrix( + # ygen["cls_id"][batch.mask].detach().cpu().numpy(), + # ypred["cls_id"][batch.mask].detach().cpu().numpy(), + # labels=range(13), + # ) + # # save the events of the first validation batch for quick checks + # if (rank == 0 or rank == "cpu") and itrain == 0: + # validation_plots(batch, ypred_raw, ygen, ypred, tensorboard_writer, epoch, outdir) with torch.autocast(device_type=device_type, dtype=dtype, enabled=device_type == "cuda"): if is_train: - loss = mlpf_loss(ygen, ypred, batch) + loss = mlpf_loss(ygen, ypred, batch, epoch) for param in model.parameters(): param.grad = None else: with torch.no_grad(): - loss = mlpf_loss(ygen, ypred, batch) + loss = mlpf_loss(ygen, ypred, batch, epoch) if is_train: loss["Total"].backward() @@ -602,16 +608,26 @@ def train_and_valid( comet_experiment.log_metrics(intermediate_losses_v, prefix="valid", step=step) val_freq_time_0 = time.time() # reset intermediate validation spacing timer - if not is_train and comet_experiment: - comet_experiment.log_confusion_matrix( - matrix=cm_X_gen, title="Element to target", row_label="X", column_label="target", epoch=epoch, file_name="cm_X_gen.json" - ) - comet_experiment.log_confusion_matrix( - matrix=cm_X_pred, title="Element to pred", row_label="X", column_label="pred", epoch=epoch, file_name="cm_X_pred.json" - ) - comet_experiment.log_confusion_matrix( - matrix=cm_id, title="Target to pred", 
row_label="gen", column_label="pred", epoch=epoch, file_name="cm_id.json" - ) + # if not is_train and comet_experiment: + # comet_experiment.log_confusion_matrix( + # matrix=cm_X_gen, + # title="Element to target", + # row_label="X", + # column_label="target", + # epoch=epoch, + # file_name="cm_X_gen.json", + # ) + # comet_experiment.log_confusion_matrix( + # matrix=cm_X_pred, + # title="Element to pred", + # row_label="X", + # column_label="pred", + # epoch=epoch, + # file_name="cm_X_pred.json", + # ) + # comet_experiment.log_confusion_matrix( + # matrix=cm_id, title="Target to pred", row_label="gen", column_label="pred", epoch=epoch, file_name="cm_id.json" + # ) num_data = torch.tensor(len(data_loader), device=rank) # sum up the number of steps from all workers @@ -652,6 +668,7 @@ def train_mlpf( comet_step_freq=None, val_freq=None, save_attention=False, + standardization_dict=None, ): """ Will run a full training by calling train(). @@ -713,6 +730,7 @@ def train_mlpf( lr_schedule=lr_schedule, val_freq=val_freq, dtype=dtype, + standardization_dict=standardization_dict, ) prof.export_chrome_trace("trace.json") else: @@ -733,6 +751,7 @@ def train_mlpf( val_freq=val_freq, dtype=dtype, tensorboard_writer=tensorboard_writer_train, + standardization_dict=standardization_dict, ) t_train = time.time() # epoch time excluding validation @@ -753,6 +772,7 @@ def train_mlpf( dtype=dtype, tensorboard_writer=tensorboard_writer_valid, save_attention=save_attention, + standardization_dict=standardization_dict, ) t_valid = time.time() @@ -847,8 +867,6 @@ def train_mlpf( _logger.info( f"Rank {rank}: epoch={epoch} / {num_epochs} " - + f"train_loss={losses_t['Total']:.4f} " - + f"valid_loss={losses_v['Total']:.4f} " + f"stale={stale_epochs} " + f"epoch_train_time={round((t_train-t0)/60, 2)}m " + f"epoch_valid_time={round((t_valid-t_train)/60, 2)}m " @@ -857,6 +875,40 @@ def train_mlpf( color="bold", ) + log_t = ( + losses_t["Regression_pt"] + + losses_t["Regression_eta"] + + losses_t["Regression_sin_phi"] + + losses_t["Regression_cos_phi"] + + losses_t["Regression_energy"] + ) + log_tot = losses_t["Classification"] + losses_t["Classification_binary"] + log_t + + _logger.info( + f"train: loss_total={log_tot:.4f} " + + f"loss_clf={losses_t['Classification']:.4f} " + + f"loss_clfbinary={losses_t['Classification_binary']:.4f} " + + f"loss_reg={log_t:.4f} ", + color="bold", + ) + + log_v = ( + losses_v["Regression_pt"] + + losses_v["Regression_eta"] + + losses_v["Regression_sin_phi"] + + losses_v["Regression_cos_phi"] + + losses_v["Regression_energy"] + ) + log_tot = losses_v["Classification"] + losses_v["Classification_binary"] + log_v + + _logger.info( + f"valid: loss_total={log_tot:.4f} " + + f"loss_clf={losses_v['Classification']:.4f} " + + f"loss_clfbinary={losses_v['Classification_binary']:.4f} " + + f"loss_reg={log_v:.4f} ", + color="bold", + ) + # save separate json files with stats for each epoch, this is robust to crashed-then-resumed trainings history_path = Path(outdir) / "history" history_path.mkdir(parents=True, exist_ok=True) @@ -919,8 +971,17 @@ def run(rank, world_size, config, args, outdir, logfile): model, optimizer = load_checkpoint(checkpoint, model, optimizer) else: # instantiate a new model in the outdir created + + input_dim = len(X_FEATURES[config["dataset"]]) + if config["dataset"] == "clic": + # extract the version of the dataset + for sample in config["test_dataset"]: + if config["test_dataset"][sample]["version"] == "2.2.0": + input_dim = 26 + break + model_kwargs = { - "input_dim": 
len(X_FEATURES[config["dataset"]]), + "input_dim": input_dim, "num_classes": len(CLASS_LABELS[config["dataset"]]), "input_encoding": config["model"]["input_encoding"], "pt_mode": config["model"]["pt_mode"], @@ -989,6 +1050,13 @@ def run(rank, world_size, config, args, outdir, logfile): last_epoch = -1 if start_epoch == 1 else start_epoch - 1 lr_schedule = get_lr_schedule(config, optimizer, config["num_epochs"], steps_per_epoch, last_epoch) + if config["standardize_input"] is True: + if (rank == 0) or (rank == "cpu"): + _logger.info("Will standardize the input features before running the training") + standardization_dict = get_input_standardization(config["dataset"], loaders["train"]) + else: + standardization_dict = None + train_mlpf( rank, world_size, @@ -1009,6 +1077,7 @@ def run(rank, world_size, config, args, outdir, logfile): comet_step_freq=config["comet_step_freq"], val_freq=config["val_freq"], save_attention=config["save_attention"], + standardization_dict=standardization_dict, ) checkpoint = torch.load(f"{outdir}/best_weights.pth", map_location=torch.device(rank)) @@ -1154,8 +1223,16 @@ def train_ray_trial(config, args, outdir=None): world_rank = ray.train.get_context().get_world_rank() world_size = ray.train.get_context().get_world_size() + input_dim = len(X_FEATURES[config["dataset"]]) + if config["dataset"] == "clic": + # extract the version of the dataset + for sample in config["test_dataset"]: + if config["test_dataset"][sample]["version"] == "2.2.0": + input_dim = 26 + break + model_kwargs = { - "input_dim": len(X_FEATURES[config["dataset"]]), + "input_dim": input_dim, "num_classes": len(CLASS_LABELS[config["dataset"]]), "input_encoding": config["model"]["input_encoding"], "pt_mode": config["model"]["pt_mode"], @@ -1254,6 +1331,7 @@ def train_ray_trial(config, args, outdir=None): comet_step_freq=config["comet_step_freq"], dtype=getattr(torch, config["dtype"]), val_freq=config["val_freq"], + standardization_dict=None, ) @@ -1346,7 +1424,6 @@ def run_hpo(config, args): import ray from ray import tune from ray.train.torch import TorchTrainer - from raytune.pt_search_space import raytune_num_samples, search_space from raytune.utils import get_raytune_schedule, get_raytune_search_alg diff --git a/mlpf/pyg/utils.py b/mlpf/pyg/utils.py index 6ec64c480..a58869439 100644 --- a/mlpf/pyg/utils.py +++ b/mlpf/pyg/utils.py @@ -1,11 +1,11 @@ import json +import logging import pickle as pkl import pandas as pd import torch import torch.utils.data -from torch.optim.lr_scheduler import OneCycleLR, CosineAnnealingLR, ConstantLR -import logging +from torch.optim.lr_scheduler import ConstantLR, CosineAnnealingLR, OneCycleLR # https://github.com/ahlinist/cmssw/blob/1df62491f48ef964d198f574cdfcccfd17c70425/DataFormats/ParticleFlowReco/interface/PFBlockElement.h#L33 # https://github.com/cms-sw/cmssw/blob/master/DataFormats/ParticleFlowCandidate/src/PFCandidate.cc#L254 @@ -328,3 +328,35 @@ def count_parameters(model): ) trainable_params += params return trainable_params, nontrainable_params, table + + +def get_input_standardization(dataset, train_loader, nsubset=10_000): + + standardization_dict = {} + + for ielem in ELEM_TYPES_NONZERO[dataset]: + standardization_dict["PFelement" + str(ielem)] = {} + + tot_events = 0 + for i, batch in enumerate(train_loader): + + tot_events += batch.X.shape[0] + + # remove the first dimension because we will stack all PFelements anyway to compute the mean/std + batch.X = batch.X.view(-1, batch.X.shape[-1]) + + msk = (batch.X[:, 0] == ielem) & (batch.X[:, 0] != 0) # 
skip 0 padded elements + + if i == 0: + # initialize + concatenated_pfelements = batch.X[msk] + else: + concatenated_pfelements = torch.cat([concatenated_pfelements, batch.X[msk]]) + + if tot_events > nsubset: + break + + standardization_dict["PFelement" + str(ielem)]["mean"] = torch.mean(concatenated_pfelements, axis=0).tolist() + standardization_dict["PFelement" + str(ielem)]["std"] = torch.std(concatenated_pfelements, axis=0).tolist() + + return standardization_dict diff --git a/mlpf/pyg_pipeline.py b/mlpf/pyg_pipeline.py index 4110e2dea..6aa6ab6fd 100644 --- a/mlpf/pyg_pipeline.py +++ b/mlpf/pyg_pipeline.py @@ -88,6 +88,8 @@ ) parser.add_argument("--test-datasets", nargs="+", default=[], help="test samples to process") +parser.add_argument("--standardize-input", action="store_true", default=None, help="will standardize the input features before training") + def get_outdir(resume_training, load): outdir = None diff --git a/parameters/pytorch/pyg-cld.yaml b/parameters/pytorch/pyg-cld.yaml index 204689385..e2353086a 100644 --- a/parameters/pytorch/pyg-cld.yaml +++ b/parameters/pytorch/pyg-cld.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_input: False dataset: cld sort_data: no data_dir: diff --git a/parameters/pytorch/pyg-clic-allsamples.yaml b/parameters/pytorch/pyg-clic-allsamples.yaml new file mode 100644 index 000000000..9b240c9fa --- /dev/null +++ b/parameters/pytorch/pyg-clic-allsamples.yaml @@ -0,0 +1,141 @@ +backend: pytorch + +standardize_input: False +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 4 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "gelu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + 
physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 + clic_edm_ww_fullhad_pf: + version: 2.1.0 + clic_edm_zh_tautau_pf: + version: 2.1.0 + clic_edm_z_tautau_pf: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 + clic_edm_ww_fullhad_pf: + version: 2.1.0 + clic_edm_zh_tautau_pf: + version: 2.1.0 + clic_edm_z_tautau_pf: + version: 2.1.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.1.0 + clic_edm_qq_pf: + version: 2.1.0 diff --git a/parameters/pytorch/pyg-clic-hits.yaml b/parameters/pytorch/pyg-clic-hits.yaml index 62b470931..7f6aa796f 100644 --- a/parameters/pytorch/pyg-clic-hits.yaml +++ b/parameters/pytorch/pyg-clic-hits.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_input: False dataset: clic data_dir: gpus: 1 diff --git a/parameters/pytorch/pyg-clic-ttbar-21-joint.yaml b/parameters/pytorch/pyg-clic-ttbar-21-joint.yaml new file mode 100644 index 000000000..915c6cb91 --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar-21-joint.yaml @@ -0,0 +1,123 @@ +backend: pytorch + +standardize_input: True +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: joint #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 4 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "gelu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.1.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-21.yaml 
b/parameters/pytorch/pyg-clic-ttbar-21.yaml new file mode 100644 index 000000000..376f2b461 --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar-21.yaml @@ -0,0 +1,123 @@ +backend: pytorch + +standardize_input: True +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 4 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "gelu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.1.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.1.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-22-joint.yaml b/parameters/pytorch/pyg-clic-ttbar-22-joint.yaml new file mode 100644 index 000000000..19c15e8d7 --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar-22-joint.yaml @@ -0,0 +1,123 @@ +backend: pytorch + +standardize_input: True +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: joint #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: 
direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 4 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "gelu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.2.0 diff --git a/parameters/pytorch/pyg-clic-ttbar-22.yaml b/parameters/pytorch/pyg-clic-ttbar-22.yaml new file mode 100644 index 000000000..0fc73c684 --- /dev/null +++ b/parameters/pytorch/pyg-clic-ttbar-22.yaml @@ -0,0 +1,123 @@ +backend: pytorch + +standardize_input: True +save_attention: yes +dataset: clic +sort_data: no +data_dir: +gpus: 1 +gpu_batch_multiplier: 1 +load: +num_epochs: 100 +patience: 20 +lr: 0.0001 +lr_schedule: cosinedecay # constant, cosinedecay, onecycle +conv_type: attention # gnn_lsh, attention, mamba, flashattention +ntrain: +ntest: +nvalid: +num_workers: 0 +prefetch_factor: +checkpoint_freq: +comet_name: particleflow-pt +comet_offline: False +comet_step_freq: 100 +dtype: float32 +val_freq: # run an extra validation run every val_freq training steps + +model: + trainable: all + learned_representation_mode: last #last, concat + input_encoding: split #split, joint + pt_mode: direct-elemtype-split + eta_mode: linear + sin_phi_mode: linear + cos_phi_mode: linear + energy_mode: direct-elemtype-split + + gnn_lsh: + conv_type: gnn_lsh + embedding_dim: 512 + width: 512 + num_convs: 8 + activation: "elu" + # gnn-lsh specific parameters + bin_size: 32 + max_num_bins: 200 + distance_dim: 128 + layernorm: True + num_node_messages: 2 + ffn_dist_hidden_dim: 128 + ffn_dist_num_layers: 2 + + attention: + conv_type: attention + num_convs: 4 + dropout_ff: 0.0 + dropout_conv_id_mha: 0.0 + dropout_conv_id_ff: 0.0 + dropout_conv_reg_mha: 0.0 + dropout_conv_reg_ff: 0.0 + activation: "gelu" + head_dim: 32 + num_heads: 32 + attention_type: math + use_pre_layernorm: True + + mamba: + conv_type: mamba + embedding_dim: 128 + width: 128 + num_convs: 2 + dropout: 0.0 + activation: "elu" + # transformer specific paramters + num_heads: 2 + # mamba specific paramters + d_state: 16 + d_conv: 4 + expand: 2 + +lr_schedule_config: + onecycle: + pct_start: 0.3 + +raytune: + local_dir: # Note: please specify an absolute path + sched: # asha, hyperband + search_alg: # bayes, 
bohb, hyperopt, nevergrad, scikit + default_metric: "val_loss" + default_mode: "min" + # Tune schedule specific parameters + asha: + max_t: 200 + reduction_factor: 4 + brackets: 1 + grace_period: 10 + hyperband: + max_t: 200 + reduction_factor: 4 + hyperopt: + n_random_steps: 10 + nevergrad: + n_random_steps: 10 + +train_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +valid_dataset: + clic: + physical: + batch_size: 1 + samples: + clic_edm_ttbar_pf: + version: 2.2.0 + +test_dataset: + clic_edm_ttbar_pf: + version: 2.2.0 diff --git a/parameters/pytorch/pyg-clic.yaml b/parameters/pytorch/pyg-clic.yaml index a51540683..185368c12 100644 --- a/parameters/pytorch/pyg-clic.yaml +++ b/parameters/pytorch/pyg-clic.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_input: False save_attention: yes dataset: clic sort_data: no diff --git a/parameters/pytorch/pyg-cms-finetune.yaml b/parameters/pytorch/pyg-cms-finetune.yaml index b70d3df4a..03f5af6c8 100644 --- a/parameters/pytorch/pyg-cms-finetune.yaml +++ b/parameters/pytorch/pyg-cms-finetune.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_input: False dataset: cms sort_data: yes data_dir: diff --git a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml index cfacab525..8485611c4 100644 --- a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml +++ b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_input: False dataset: cms sort_data: yes data_dir: diff --git a/parameters/pytorch/pyg-cms.yaml b/parameters/pytorch/pyg-cms.yaml index 7d5f7e4a1..7507d848d 100644 --- a/parameters/pytorch/pyg-cms.yaml +++ b/parameters/pytorch/pyg-cms.yaml @@ -1,5 +1,6 @@ backend: pytorch +standardize_input: False save_attention: no dataset: cms sort_data: yes diff --git a/scripts/clic/postprocessing.py b/scripts/clic/postprocessing.py index 54d2a857b..e05b2ae9c 100644 --- a/scripts/clic/postprocessing.py +++ b/scripts/clic/postprocessing.py @@ -1,20 +1,22 @@ +import glob import os -# to prevent https://stackoverflow.com/questions/52026652/openblas-blas-thread-init-pthread-create-resource-temporarily-unavailable +# noqa: to prevent https://stackoverflow.com/questions/52026652/openblas-blas-thread-init-pthread-create-resource-temporarily-unavailable os.environ["OMP_NUM_THREADS"] = "1" os.environ["OPENBLAS_NUM_THREADS"] = "1" os.environ["MKL_NUM_THREADS"] = "1" os.environ["VECLIB_MAXIMUM_THREADS"] = "1" os.environ["NUMEXPR_NUM_THREADS"] = "1" -import numpy as np +import bz2 + import awkward +import fastjet +import numpy as np +import pyhepmc +import tqdm import uproot import vector -import tqdm -import pyhepmc -import bz2 -import fastjet from scipy.sparse import coo_matrix track_coll = "SiTracks_Refitted" @@ -61,6 +63,16 @@ "sigma_x", "sigma_y", "sigma_z", + # additional cluster input features + "energyError", + "sigma_energy", + "sigma_x_weighted", + "sigma_y_weighted", + "sigma_z_weighted", + "energy_weighted_width", + "pos_shower_max", + "width_shower_max", + "energy_shower_max", ] hit_feature_order = [ "elemtype", @@ -311,7 +323,7 @@ def genparticle_track_adj(sitrack_links, iev): def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cluster_arr = prop_data["PandoraClusters"][iev] - feats = ["type", "position.x", "position.y", "position.z", "iTheta", "phi", "energy"] + feats = ["type", "position.x", "position.y", "position.z", "iTheta", "phi", "energy", "energyError"] ret = {feat: cluster_arr["PandoraClusters." 
+ feat] for feat in feats} hit_idx = np.array(hit_to_cluster[0]) @@ -324,8 +336,15 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cl_sigma_y = [] cl_sigma_z = [] + cl_sigma_energy = [] + cl_sigma_x_weighted, cl_sigma_y_weighted, cl_sigma_z_weighted = [], [], [] + cl_energy_weighted_width = [] + cl_pos_shower_max, cl_energy_shower_max, cl_width_shower_max = [], [], [] + n_cl = len(ret["energy"]) - for cl in range(n_cl): + + # xs, ys, zs, es = [], [], [], [] + for i, cl in enumerate(range(n_cl)): msk_cl = cluster_idx == cl hits = hit_idx[msk_cl] @@ -351,6 +370,46 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): cl_sigma_y.append(np.std(hits_posy)) cl_sigma_z.append(np.std(hits_posz)) + cl_sigma_energy.append(np.std(hits_energy)) + cl_sigma_x_weighted.append(np.std(hits_posx * hits_energy)) + cl_sigma_y_weighted.append(np.std(hits_posy * hits_energy)) + cl_sigma_z_weighted.append(np.std(hits_posz * hits_energy)) + + # z_bar = np.sum(hits_posz * hits_energy) / np.sum(hits_energy) # energy weighted average + x_bar = np.sum(hits_posx * hits_energy) / np.sum(hits_energy) # energy weighted average + y_bar = np.sum(hits_posy * hits_energy) / np.sum(hits_energy) # energy weighted average + + num = (np.sum(hits_energy * (hits_posx - x_bar) ** 2)) + (np.sum(hits_energy * (hits_posy - y_bar) ** 2)) + den = np.sum(hits_energy) + + cl_energy_weighted_width.append(num / den) + + # get position at shower max + # at each unique "z" integrate the energy of all the hits to find zmax + zmax, emax = 0, -1000 + for z in np.unique(np.array(hits_posz)): + msk = np.array(hits_posz) == z + ez = np.sum(np.array(hits_energy)[msk]) + + if ez > emax: + zmax, emax = z, ez + + cl_pos_shower_max.append(zmax) + cl_energy_shower_max.append(emax) + + # get width at shower max + msk = np.array(hits_posz) == zmax # select the hits at zmax + + x_bar = np.sum(np.array(hits_posx)[msk] * np.array(hits_energy)[msk]) / np.sum(np.array(hits_energy)[msk]) # energy weighted average + y_bar = np.sum(np.array(hits_posy)[msk] * np.array(hits_energy)[msk]) / np.sum(np.array(hits_energy)[msk]) # energy weighted average + + num = (np.sum(np.array(hits_energy)[msk] * (np.array(hits_posx)[msk] - x_bar) ** 2)) + ( + np.sum(np.array(hits_energy)[msk] * (np.array(hits_posy)[msk] - y_bar) ** 2) + ) + den = np.sum(np.array(hits_energy)[msk]) + + cl_width_shower_max.append(num / den) + ret["energy_ecal"] = np.array(cl_energy_ecal) ret["energy_hcal"] = np.array(cl_energy_hcal) ret["energy_other"] = np.array(cl_energy_other) @@ -374,6 +433,17 @@ def cluster_to_features(prop_data, hit_features, hit_to_cluster, iev): ret["sin_phi"] = np.sin(ret["phi"]) ret["cos_phi"] = np.cos(ret["phi"]) + # additional cluster input features + ret["sigma_energy"] = np.array(cl_sigma_energy) + ret["sigma_x_weighted"] = np.array(cl_sigma_x_weighted) + ret["sigma_y_weighted"] = np.array(cl_sigma_y_weighted) + ret["sigma_z_weighted"] = np.array(cl_sigma_z_weighted) + ret["energy_weighted_width"] = np.array(cl_energy_weighted_width) + + ret["pos_shower_max"] = np.array(cl_pos_shower_max) + ret["energy_shower_max"] = np.array(cl_energy_shower_max) + ret["width_shower_max"] = np.array(cl_width_shower_max) + return awkward.Record(ret) @@ -936,16 +1006,25 @@ def parse_args(): import argparse parser = argparse.ArgumentParser() - parser.add_argument("--input", type=str, help="Input file ROOT file", required=True) + parser.add_argument("--input", type=str, help="Input ROOT file - else if dir then will process all files inside", 
required=True) parser.add_argument("--outpath", type=str, default="raw", help="output path") args = parser.parse_args() return args def process(args): - infile = args.input - outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") - process_one_file(infile, outfile) + + if os.path.isdir(args.input) is True: + print("Will process all files in " + args.input) + + flist = glob.glob(args.input + "/*.root") + for infile in flist: + outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") + process_one_file(infile, outfile) + else: + infile = args.input + outfile = os.path.join(args.outpath, os.path.basename(infile).split(".")[0] + ".parquet") + process_one_file(infile, outfile) if __name__ == "__main__":
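
For reference, a standalone numpy sketch of the new shower-shape cluster features (energy_weighted_width, pos_shower_max, energy_shower_max, width_shower_max), mirroring the cluster_to_features logic in scripts/clic/postprocessing.py above for a single cluster; the hit positions and energies are made up for illustration.

import numpy as np

# Toy calorimeter hits belonging to one cluster
hits_posx = np.array([1.0, 1.2, 0.9, 1.1])
hits_posy = np.array([0.5, 0.4, 0.6, 0.5])
hits_posz = np.array([10.0, 10.0, 12.0, 12.0])
hits_energy = np.array([0.2, 0.3, 0.6, 0.1])

# energy-weighted transverse width of the full cluster
x_bar = np.sum(hits_posx * hits_energy) / np.sum(hits_energy)
y_bar = np.sum(hits_posy * hits_energy) / np.sum(hits_energy)
energy_weighted_width = (
    np.sum(hits_energy * (hits_posx - x_bar) ** 2) + np.sum(hits_energy * (hits_posy - y_bar) ** 2)
) / np.sum(hits_energy)

# shower maximum: the z layer with the largest summed hit energy
zmax, emax = 0.0, -np.inf
for z in np.unique(hits_posz):
    ez = np.sum(hits_energy[hits_posz == z])
    if ez > emax:
        zmax, emax = z, ez
pos_shower_max, energy_shower_max = zmax, emax

# transverse width restricted to the hits in the shower-max layer
msk = hits_posz == zmax
x_bar = np.sum(hits_posx[msk] * hits_energy[msk]) / np.sum(hits_energy[msk])
y_bar = np.sum(hits_posy[msk] * hits_energy[msk]) / np.sum(hits_energy[msk])
width_shower_max = (
    np.sum(hits_energy[msk] * (hits_posx[msk] - x_bar) ** 2)
    + np.sum(hits_energy[msk] * (hits_posy[msk] - y_bar) ** 2)
) / np.sum(hits_energy[msk])

print(energy_weighted_width, pos_shower_max, energy_shower_max, width_shower_max)
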