normalize loss, reparametrize network (jpata#297)
* trainable configurable

* added additional plots

* switch to 1.7.1, relu

* pin tensorflow

* reduce pipeline net size

* remove TF from pipeline

---------

Co-authored-by: Joosep Pata <joosep.pata@kbfi.ee>
jpata and Joosep Pata authored Mar 22, 2024
1 parent 8d9065c commit e1b439a
Showing 23 changed files with 2,214 additions and 318 deletions.
122 changes: 61 additions & 61 deletions .github/workflows/test.yml
@@ -28,68 +28,68 @@ jobs:
         python-version: "3.10.12"
         cache: "pip"
     - run: pip install -r requirements.txt
-    - run: pip3 install torch==2.2.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-    - run: pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv torch_geometric -f https://data.pyg.org/whl/torch-2.2.0+cpu.html
-
-  tf-unittests:
-    runs-on: ubuntu-22.04
-    needs: [deps]
-    steps:
-    - uses: actions/checkout@v3
-    - uses: actions/setup-python@v4
-      with:
-        python-version: "3.10.12"
-        cache: "pip"
-    - run: pip install -r requirements.txt
-    - run: PYTHONPATH=. python3 -m unittest tests/test_tf.py
+    - run: pip3 install torch==2.2.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+    - run: pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv torch_geometric -f https://data.pyg.org/whl/torch-2.2.1+cpu.html
 
-  tf-clic-pipeline:
-    runs-on: ubuntu-22.04
-    needs: [tf-unittests]
-    steps:
-    - uses: actions/checkout@v3
-    - uses: actions/setup-python@v4
-      with:
-        python-version: "3.10.12"
-        cache: "pip"
-    - run: pip install -r requirements.txt
-    - run: ./scripts/local_test_clic_pipeline.sh
-
-  tf-clic-hits-pipeline:
-    runs-on: ubuntu-22.04
-    needs: [tf-unittests]
-    steps:
-    - uses: actions/checkout@v3
-    - uses: actions/setup-python@v4
-      with:
-        python-version: "3.10.12"
-        cache: "pip"
-    - run: pip install -r requirements.txt
-    - run: ./scripts/local_test_clic_hits_pipeline.sh
-
-  tf-delphes-pipeline:
-    runs-on: ubuntu-22.04
-    needs: [tf-unittests]
-    steps:
-    - uses: actions/checkout@v3
-    - uses: actions/setup-python@v4
-      with:
-        python-version: "3.10.12"
-        cache: "pip"
-    - run: pip install -r requirements.txt
-    - run: ./scripts/local_test_delphes_pipeline.sh
-
-  tf-cms-pipeline:
-    runs-on: ubuntu-22.04
-    needs: [tf-unittests]
-    steps:
-    - uses: actions/checkout@v3
-    - uses: actions/setup-python@v4
-      with:
-        python-version: "3.10.12"
-        cache: "pip"
-    - run: pip install -r requirements.txt
-    - run: ./scripts/local_test_cms_pipeline.sh
+  # tf-unittests:
+  #   runs-on: ubuntu-22.04
+  #   needs: [deps]
+  #   steps:
+  #   - uses: actions/checkout@v3
+  #   - uses: actions/setup-python@v4
+  #     with:
+  #       python-version: "3.10.12"
+  #       cache: "pip"
+  #   - run: pip install -r requirements.txt
+  #   - run: PYTHONPATH=. python3 -m unittest tests/test_tf.py
+  #
+  # tf-clic-pipeline:
+  #   runs-on: ubuntu-22.04
+  #   needs: [tf-unittests]
+  #   steps:
+  #   - uses: actions/checkout@v3
+  #   - uses: actions/setup-python@v4
+  #     with:
+  #       python-version: "3.10.12"
+  #       cache: "pip"
+  #   - run: pip install -r requirements.txt
+  #   - run: ./scripts/local_test_clic_pipeline.sh
+  #
+  # tf-clic-hits-pipeline:
+  #   runs-on: ubuntu-22.04
+  #   needs: [tf-unittests]
+  #   steps:
+  #   - uses: actions/checkout@v3
+  #   - uses: actions/setup-python@v4
+  #     with:
+  #       python-version: "3.10.12"
+  #       cache: "pip"
+  #   - run: pip install -r requirements.txt
+  #   - run: ./scripts/local_test_clic_hits_pipeline.sh
+  #
+  # tf-delphes-pipeline:
+  #   runs-on: ubuntu-22.04
+  #   needs: [tf-unittests]
+  #   steps:
+  #   - uses: actions/checkout@v3
+  #   - uses: actions/setup-python@v4
+  #     with:
+  #       python-version: "3.10.12"
+  #       cache: "pip"
+  #   - run: pip install -r requirements.txt
+  #   - run: ./scripts/local_test_delphes_pipeline.sh
+  #
+  # tf-cms-pipeline:
+  #   runs-on: ubuntu-22.04
+  #   needs: [tf-unittests]
+  #   steps:
+  #   - uses: actions/checkout@v3
+  #   - uses: actions/setup-python@v4
+  #     with:
+  #       python-version: "3.10.12"
+  #       cache: "pip"
+  #   - run: pip install -r requirements.txt
+  #   - run: ./scripts/local_test_cms_pipeline.sh
 
   pyg-unittests:
     runs-on: ubuntu-22.04
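The pin moves from torch 2.2.0 to 2.2.1 on the CPU wheel index, with the matching PyG extension wheels bumped alongside it. A quick sanity check one could run in that environment, as a sketch (the exact version string depends on the installed wheel):

```python
# Minimal check that the pinned CPU-only build was picked up; PyTorch tags
# wheels from the CPU index with a "+cpu" local version suffix.
import torch

print(torch.__version__)          # expected to start with "2.2.1"
print(torch.cuda.is_available())  # False for the CPU-only wheels
```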
2 changes: 1 addition & 1 deletion mlpf/data_cms/genjob_nopu.sh
@@ -14,7 +14,7 @@ MLPF_PATH=/home/joosep/particleflow/
 SAMPLE=$1
 SEED=$2
 
-WORKDIR=/scratch/local/joosep/$SAMPLE/$SEED
+WORKDIR=/scratch/local/joosep/$SLURM_JOBID/$SAMPLE/$SEED
 #WORKDIR=`pwd`/$SAMPLE/$SEED
 mkdir -p $WORKDIR
 mkdir -p $OUTDIR
2 changes: 1 addition & 1 deletion mlpf/data_cms/genjob_pu55to75.sh
@@ -14,7 +14,7 @@ MLPF_PATH=/home/joosep/particleflow/
 SAMPLE=$1
 SEED=$2
 
-WORKDIR=/scratch/local/joosep/$SAMPLE/$SEED
+WORKDIR=/scratch/local/joosep/$SLURM_JOBID/$SAMPLE/$SEED
 #WORKDIR=`pwd`/$SAMPLE/$SEED
 mkdir -p $WORKDIR
 mkdir -p $OUTDIR
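Both generation scripts get the same one-line change: $SLURM_JOBID is inserted into the scratch path, presumably so that concurrent jobs processing the same SAMPLE and SEED cannot collide in a shared work directory. A minimal sketch of the resulting layout (values illustrative):

```python
# SLURM sets SLURM_JOBID uniquely per job, so two concurrent jobs get
# disjoint scratch directories even for identical (sample, seed) inputs.
import os

sample = "TTbar_14TeV_TuneCUETP8M1_cfi"  # illustrative values
seed = "700001"
jobid = os.environ.get("SLURM_JOBID", "local")  # unique per SLURM job
workdir = f"/scratch/local/joosep/{jobid}/{sample}/{seed}"
print(workdir)
```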
2 changes: 1 addition & 1 deletion mlpf/data_cms/prepare_args.py
@@ -14,7 +14,7 @@
     ("ZpTT_1500_14TeV_TuneCP5_cfi", 600000, 605010, "genjob_pu55to75.sh", outdir + "/pu55to75"),
 
     ("TTbar_14TeV_TuneCUETP8M1_cfi", 700000, 701000, "genjob_nopu.sh", outdir + "/nopu"),
-    ("MultiParticlePFGun50_cfi", 800000, 810000, "genjob_nopu.sh", outdir + "/nopu"),
+    ("MultiParticlePFGun50_cfi", 800000, 850000, "genjob_nopu.sh", outdir + "/nopu"),
 
     ("SingleElectronFlatPt1To1000_pythia8_cfi", 900000, 910000, "genjob_nopu.sh", outdir + "/nopu"),
     ("SingleGammaFlatPt1To1000_pythia8_cfi", 1000000, 1010000, "genjob_nopu.sh", outdir + "/nopu"),
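The only change here widens the seed range for the multi-particle gun sample; the (start, end) pair appears to define one generation job per seed, so this is a five-fold increase:

```python
# Quick arithmetic on the MultiParticlePFGun50_cfi range change, assuming
# one generation job per seed in [start, end).
old_jobs = 810000 - 800000
new_jobs = 850000 - 800000
print(old_jobs, new_jobs, new_jobs / old_jobs)  # 10000 50000 5.0
```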
3 changes: 2 additions & 1 deletion mlpf/heptfds/cms_pf/multiparticlegun.py
@@ -21,11 +21,12 @@
 class CmsPfMultiParticleGun(tfds.core.GeneratorBasedBuilder):
     """DatasetBuilder for cms_pf_multi_particle_gun dataset."""
 
-    VERSION = tfds.core.Version("1.7.0")
+    VERSION = tfds.core.Version("1.7.1")
     RELEASE_NOTES = {
         "1.6.0": "Initial release",
         "1.6.1": "Additional stats",
         "1.7.0": "Add cluster shape vars",
+        "1.7.1": "Additional stats",
     }
     MANUAL_DOWNLOAD_INSTRUCTIONS = """
     rsync -r --progress \
1 change: 1 addition & 0 deletions mlpf/jet_utils.py
@@ -1,4 +1,5 @@
 import numpy as np
+
 import numba
 import awkward
 import vector
25 changes: 7 additions & 18 deletions mlpf/pyg/PFDataset.py
@@ -103,19 +103,14 @@ def __init__(
         )
 
 
-def next_power_of_2(x):
-    return 1 if x == 0 else 2 ** (x - 1).bit_length()
-
-
 class Collater:
     """Based on the Collater found on torch_geometric docs we build our own."""
 
-    def __init__(self, keys_to_get, follow_batch=None, exclude_keys=None, pad_3d=True, pad_power_of_two=True):
+    def __init__(self, keys_to_get, follow_batch=None, exclude_keys=None, pad_3d=True):
         self.follow_batch = follow_batch
         self.exclude_keys = exclude_keys
         self.keys_to_get = keys_to_get
         self.pad_3d = pad_3d
-        self.pad_power_of_two = pad_power_of_two
 
     def __call__(self, inputs):
         num_samples_in_batch = len(inputs)
@@ -133,16 +128,7 @@ def __call__(self, inputs):
         if not self.pad_3d:
             return ret
         else:
-            # pad to closest power of two
-            if self.pad_power_of_two:
-                sizes = [next_power_of_2(len(b.X)) for b in batch]
-                max_size = max(sizes)
-            else:
-                max_size = None
-            ret = {
-                k: torch_geometric.utils.to_dense_batch(getattr(ret, k), ret.batch, max_num_nodes=max_size)
-                for k in elem_keys
-            }
+            ret = {k: torch_geometric.utils.to_dense_batch(getattr(ret, k), ret.batch) for k in elem_keys}
 
             ret["mask"] = ret["X"][1]
 
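With the power-of-two padding removed, the collater now pads each batch only to its longest event. A minimal sketch of what torch_geometric.utils.to_dense_batch does with a ragged batch (shapes illustrative):

```python
# to_dense_batch packs per-element features into a dense
# [num_events, max_elements, num_features] tensor plus a validity mask.
import torch
from torch_geometric.utils import to_dense_batch

x = torch.arange(8 * 4, dtype=torch.float32).reshape(8, 4)  # 8 elements, 4 features
batch = torch.tensor([0, 0, 0, 1, 1, 1, 1, 1])  # event index of each element

x_pad, mask = to_dense_batch(x, batch)  # no max_num_nodes: pad to longest event
print(x_pad.shape)  # torch.Size([2, 5, 4])
print(mask)         # mask[0, 3:] is False: those slots are padding
```

Without the max_num_nodes argument, the dense batch is exactly as wide as its largest event, which was previously rounded up to the next power of two.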
@@ -164,6 +150,8 @@ def __init__(self, data_loaders):
         max_loader_size = max([len(dl) for dl in data_loaders])
 
         self.loader_ds_indices = []
+
+        # iterate loaders interleaved
         for i in range(max_loader_size):
             for iloader, loader in enumerate(data_loaders):
                 if i < len(loader):
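The new comment documents the interleaving: batches are drawn round-robin across datasets until the longest loader runs out. A stand-alone sketch of the visitation order this loop produces (plain lists stand in for the real DataLoaders):

```python
# Shorter loaders simply drop out of the round-robin once exhausted,
# so one "epoch" mixes all datasets instead of consuming them in sequence.
data_loaders = [["cms0", "cms1", "cms2"], ["clic0"]]  # stand-ins for DataLoaders
max_loader_size = max(len(dl) for dl in data_loaders)

loader_ds_indices = []
for i in range(max_loader_size):
    for iloader, loader in enumerate(data_loaders):
        if i < len(loader):
            loader_ds_indices.append(iloader)

print(loader_ds_indices)  # [0, 1, 0, 0]: loader 0, then 1, then 0 twice
```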
@@ -198,7 +186,7 @@ def __len__(self):
         return len_
 
 
-def get_interleaved_dataloaders(world_size, rank, config, use_cuda, pad_3d, pad_power_of_two, use_ray):
+def get_interleaved_dataloaders(world_size, rank, config, use_cuda, pad_3d, use_ray):
     loaders = {}
     for split in ["train", "valid"]:  # build train, valid dataset and dataloaders
         loaders[split] = []
@@ -232,12 +220,13 @@ def get_interleaved_dataloaders(world_size, rank, config, use_cuda, pad_3d, pad_power_of_two, use_ray):
             loader = PFDataLoader(
                 dataset,
                 batch_size=batch_size,
-                collate_fn=Collater(["X", "ygen"], pad_3d=pad_3d, pad_power_of_two=pad_power_of_two),
+                collate_fn=Collater(["X", "ygen"], pad_3d=pad_3d),
                 sampler=sampler,
                 num_workers=config["num_workers"],
                 prefetch_factor=config["prefetch_factor"],
                 pin_memory=use_cuda,
                 pin_memory_device="cuda:{}".format(rank) if use_cuda else "",
+                drop_last=True,
             )
 
             if use_ray:
78 changes: 70 additions & 8 deletions mlpf/pyg/inference.py
@@ -14,10 +14,16 @@
     compute_met_and_ratio,
     format_dataset_name,
     load_eval_data,
+    plot_jets,
     plot_jet_ratio,
+    plot_jet_response_binned,
+    plot_jet_response_binned_eta,
+    plot_jet_response_binned_separate,
     plot_met,
     plot_met_ratio,
+    plot_met_response_binned,
     plot_num_elements,
+    plot_particle_multiplicity,
     plot_particles,
     plot_sum_energy,
 )
@@ -29,11 +35,15 @@
 
 
 def predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_match_dr, outpath, dir_name, sample):
+    outfile = f"{outpath}/preds{dir_name}/{sample}/pred_{rank}_{i}.parquet"
+    if os.path.isfile(outfile):
+        return
+
     if conv_type != "gravnet":
         X_pad, mask = torch_geometric.utils.to_dense_batch(batch.X, batch.batch)
         batch_pad = Batch(X=X_pad, mask=mask).to(rank)
         ypred = model(batch_pad.X, batch_pad.mask)
-        ypred = ypred[0][mask], ypred[1][mask], ypred[2][mask]
+        ypred = ypred[0][mask], ypred[1][mask]
     else:
         _batch = batch.to(rank)
         ypred = model(_batch.X, _batch.batch)
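The new early return makes prediction restartable: any batch whose output parquet is already on disk is skipped. A minimal sketch of the pattern (names and paths hypothetical):

```python
# Output existence doubles as a "done" marker, so a killed prediction run
# can be restarted without recomputing finished batches.
import os

def predict_if_missing(i, outdir="preds/sample"):  # hypothetical names
    outfile = os.path.join(outdir, f"pred_0_{i}.parquet")
    if os.path.isfile(outfile):
        return  # finished in an earlier run; skip recomputation
    # ... run the model on batch i, then write outfile ...
```

One caveat of this scheme is that a partially written file from a crashed run also counts as done, so stale outputs may need manual cleanup.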
@@ -125,9 +135,13 @@ def predict_one_batch(conv_type, model, i, batch, rank, jetdef, jet_ptcut, jet_match_dr, outpath, dir_name, sample):
                 "matched_jets": matched_jets,
             }
         ),
-        f"{outpath}/preds{dir_name}/{sample}/pred_{rank}_{i}.parquet",
+        outfile,
     )
-    _logger.info(f"Saved predictions at {outpath}/preds{dir_name}/{sample}/pred_{rank}_{i}.parquet")
+    _logger.info(f"Saved predictions at {outfile}")
+
+
+def predict_one_batch_args(args):
+    predict_one_batch(*args)
 
 
 @torch.no_grad()
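The added predict_one_batch_args is a thin tuple-unpacking wrapper, the usual trick for feeding a multi-argument function to map-style APIs that pass a single object per task (the call site is not shown in the visible diff, so the multiprocessing use below is an assumption); a self-contained sketch:

```python
# pool.map passes one object per task, so a wrapper splats the tuple
# back into positional arguments for the real worker function.
from multiprocessing import Pool

def scale(value, factor):
    return value * factor

def scale_args(args):
    return scale(*args)  # unpack the tuple into positional arguments

if __name__ == "__main__":
    with Pool(2) as pool:
        print(pool.map(scale_args, [(1, 10), (2, 10)]))  # [10, 20]
```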
@@ -165,13 +179,61 @@ def make_plots(outpath, sample, dataset, dir_name=""):
 
     yvals, X, _ = load_eval_data(str(pred_path / "*.parquet"), -1)
 
-    plot_num_elements(X, cp_dir=plots_path, title=format_dataset_name(sample))
-    plot_sum_energy(yvals, CLASS_NAMES[dataset], cp_dir=plots_path, title=format_dataset_name(sample))
+    title = format_dataset_name(sample)
+    plot_num_elements(X, cp_dir=plots_path, title=title)
+    plot_sum_energy(yvals, CLASS_NAMES[dataset], cp_dir=plots_path, title=title)
+    plot_particle_multiplicity(X, yvals, CLASS_NAMES[dataset], cp_dir=plots_path, title=title)
 
-    plot_jet_ratio(yvals, cp_dir=plots_path, title=format_dataset_name(sample))
+    plot_jets(
+        yvals,
+        cp_dir=plots_path,
+        title=title,
+    )
+    plot_jet_ratio(
+        yvals,
+        cp_dir=plots_path,
+        title=title,
+        bins=np.linspace(0, 5, 100),
+        logy=True,
+    )
+    plot_jet_ratio(
+        yvals,
+        cp_dir=plots_path,
+        title=title,
+        bins=np.linspace(0.5, 1.5, 100),
+        logy=False,
+        file_modifier="_bins_0p5_1p5",
+    )
+    plot_jet_response_binned(yvals, cp_dir=plots_path, title=title)
+    plot_jet_response_binned_eta(yvals, cp_dir=plots_path, title=title)
+    plot_jet_response_binned_separate(yvals, cp_dir=plots_path, title=title)
 
     met_data = compute_met_and_ratio(yvals)
-    plot_met(met_data, cp_dir=plots_path, title=format_dataset_name(sample))
-    plot_met_ratio(met_data, cp_dir=plots_path, title=format_dataset_name(sample))
+    plot_met(met_data, cp_dir=plots_path, title=title)
+    plot_met_ratio(met_data, cp_dir=plots_path, title=title)
+    plot_met_ratio(
+        met_data,
+        cp_dir=plots_path,
+        title=title,
+        bins=np.linspace(0, 20, 100),
+        logy=True,
+    )
+    plot_met_ratio(
+        met_data,
+        cp_dir=plots_path,
+        title=title,
+        bins=np.linspace(0, 2, 100),
+        logy=False,
+        file_modifier="_bins_0_2",
+    )
+    plot_met_ratio(
+        met_data,
+        cp_dir=plots_path,
+        title=title,
+        bins=np.linspace(0, 5, 100),
+        logy=False,
+        file_modifier="_bins_0_5",
+    )
+    plot_met_response_binned(met_data, cp_dir=plots_path, title=title)
 
     plot_particles(yvals, cp_dir=plots_path, title=format_dataset_name(sample))
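The bins passed to plot_jet_ratio and plot_met_ratio above are plain numpy arrays of edge values; a sketch of how a response-ratio histogram over such edges is filled (data invented for illustration):

```python
# np.linspace(0.5, 1.5, 100) gives 100 edges and therefore 99 bins, matching
# the "_bins_0p5_1p5" zoom around a perfectly reconstructed ratio of 1.0.
import numpy as np

ratio = np.array([0.8, 0.95, 1.0, 1.02, 1.3, 4.0])  # invented pred/gen ratios
bins = np.linspace(0.5, 1.5, 100)                    # 100 edges, 99 bins
counts, edges = np.histogram(ratio, bins=bins)
print(len(edges), counts.sum())  # 100 5  (the 4.0 entry falls outside the range)
```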
[diff truncated by GitHub: the remaining 15 of the 23 changed files are not shown]
