integrate hep_tfds, September 2022 benchmark training (jpata#136)

* Initial commit * add template dataset definitions * Add initial CMS particle-flow dataset implementation Also changed to a new tensorflow dataset template * add test scripts * Run black formatting on python files * Add instructions to cms_pf, use manual_dir for preprocessing * fix: ability to choose data directory for the tfrecords files * feat: Add Delphes dataset * fix: support loading both .pkl.bz2 and .pkl * fix: remove extra dimension in cms_pf data items * fix cms * fixes for delphes * ensure dir exists * separate cms datasets * clarify manual dir * cleanup print * added singleele and singlemu * update 1.1 * cleanup cms datasets * update datamodel * added new datasets * gen/sim 12_3_0_pre6 generation (#1) * 1.2 format, ztt dataset * version 1.3.0 with new gensim truth * new dataset * add qcd * add some asserts * add new features * keep PS * add tau as pf target * 1.3.1 remove ps and brem (#2) * fix HF labeling (#3) * add new high-PU QCD dataset, update energy * up * fix * Add gen jet index (#4) * first attempt at gen jet clustering * add other reqs * revert test * fix mapping to before masking particles * fix out of index bufg * benchmark training for CMS * move path * move path * remove submodule * remove * move * fix import * format * format * remove some dummy files * up * try with masking * use a different dataset for logging the jet/met distributions * clean * added clic ttbar Co-authored-by: Eric Wulff <eric.g.t.wulff@gmail.com> Co-authored-by: Eric Wulff <eric.wulff@cern.ch> Co-authored-by: Javier Duarte <jduarte@ucsd.edu> Former-commit-id: fb89d79
erwulff · Sep 2, 2022 · 05e14e8 · 05e14e8
1 parent 296edaa
commit 05e14e8
Show file tree

Hide file tree

Showing 45 changed files with 2,127 additions and 961 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -21,7 +21,6 @@ jobs:
       - name: Install python deps
         run: |
           pip install -r requirements.txt
-          pip install ./hep_tfds
           HOROVOD_WITH_TENSORFLOW=1 pip install horovod[tensorflow,keras]
       - name: Run delphes TF model
         run: ./scripts/local_test_delphes_pipeline.sh
@@ -38,7 +37,6 @@ jobs:
       - name: Install python deps
         run: |
           pip install -r requirements.txt
-          pip install ./hep_tfds
           HOROVOD_WITH_TENSORFLOW=1 pip install horovod[tensorflow,keras]
       - name: Run CMS TF model using the pipeline
         run: ./scripts/local_test_cms_pipeline.sh
diff --git a/.gitmodules b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "hep_tfds"]
-    path = hep_tfds
-    url = https://github.com/jpata/hep_tfds

diff --git a/hep_tfds b/hep_tfds
diff --git a/mlpf/data_clic/postprocessing.py b/mlpf/data_clic/postprocessing.py
@@ -0,0 +1,235 @@
+import bz2
+import json
+
+import networkx as nx
+import numpy as np
+import pandas
+
+# PDG IDs 12, 14, 16 are neutrinos; reco elements matched to one of them are skipped when an event is flattened.
+neutrinos = [12, 14, 16]
+labels_ys_cand = [0, 211, 130, 22, 11, 13]  # PF candidate classes; the list position is used as the class index
+
+# generator-level classes that can be reconstructed; the list position is used as the class index
+labels_ys_gen = [0, 211, 130, 22, 11, 13]
+
+
+def prepare_data_clic(fn):
+    """Convert one bz2-compressed JSON file of events into per-event feature and target arrays.
+
+    Each event must provide the tables "genparticles", "clusters", "tracks" and "pfs". Every
+    generator particle that passes filter_gp is matched to at most one track or cluster, and every
+    track and cluster then becomes one output row, together with the generator particle and the
+    PF candidate associated to it (all-zero target rows when there is none).
+
+    Args:
+        fn: path to the bz2-compressed JSON file holding a list of events.
+
+    Returns:
+        A list with one (Xs, ys_gen, ys_cand) tuple per event. Rows follow the tracks first and
+        then the clusters, leaving out elements matched to a neutrino:
+          Xs: [0, px, py, pz, nhits, d0, z0] for a track,
+              [1, x, y, z, nhits_ecal, nhits_hcal, energy] for a cluster.
+          ys_gen, ys_cand: [class index, charge, px, py, pz], where the class index is the
+              position in labels_ys_gen / labels_ys_cand.
+
+    Raises:
+        AssertionError: if a PF candidate references neither a track nor a cluster, or if two
+            PF candidates reference the same track or cluster.
+    """
+    # constants of the track pT estimate, pt = a * |b / omega|
+    a = 3 * 10**-4
+    b = 5  # B-field in tesla
+
+    def map_pdgid_to_candid(pdgid, charge):
+        """Reduce a PDG ID to one of the classes 0, 22, 11, 13, 211 (charged) or 130 (neutral)."""
+        if pdgid in [0, 22, 11, 13]:
+            return pdgid
+
+        # charged hadron
+        if abs(charge) > 0:
+            return 211
+
+        # neutral hadron
+        return 130
+
+    def track_pt(omega):
+        return a * np.abs(b / omega)
+
+    def track_as_array(df_tr, itr):
+        row = df_tr.loc[itr]
+        return [0, row["px"], row["py"], row["pz"], row["nhits"], row["d0"], row["z0"]]
+
+    def cluster_as_array(df_cl, icl):
+        row = df_cl.loc[icl]
+        return [1, row["x"], row["y"], row["z"], row["nhits_ecal"], row["nhits_hcal"], row["energy"]]
+
+    def gen_as_array(df_gen, igen):
+        """Return [|pdgid|, charge, px, py, pz, energy] of a genparticle, or zeros if igen is None."""
+        # compare against None explicitly: row index 0 is a valid particle
+        if igen is None:
+            return np.zeros(6)
+        row = df_gen.loc[igen]
+        return np.array([abs(row["pdgid"]), row["charge"], row["px"], row["py"], row["pz"], row["energy"]])
+
+    def pf_as_array(df_pfs, ipf):
+        """Return [|type|, charge, px, py, pz, energy] of a PF candidate, or zeros if ipf is None."""
+        # compare against None explicitly: row index 0 is a valid candidate
+        if ipf is None:
+            return np.zeros(6)
+        row = df_pfs.loc[ipf]
+        return np.array([abs(row["type"]), row["charge"], row["px"], row["py"], row["pz"], row["energy"]])
+
+    def filter_gp(df_gen, gp):
+        """Keep only genparticles with status 1 and an energy above 0.2."""
+        row = df_gen.loc[gp]
+        return bool(row["status"] == 1 and row["energy"] > 0.2)
+
+    def flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs):
+        """Build the (Xs, ys_gen, ys_cand) arrays of one event, tracks first and then clusters."""
+        Xs = []
+        ys_gen = []
+        ys_cand = []
+
+        element_tables = [("tr", df_tr, track_as_array), ("cl", df_cl, cluster_as_array)]
+        for kind, df_elem, elem_as_array in element_tables:
+            for ielem in range(len(df_elem)):
+                # genparticle and PF candidate associated to this element, None where missing
+                gp, rp = pairs.get((kind, ielem), (None, None))
+
+                ys = gen_as_array(df_gen, gp)
+                cand = pf_as_array(df_pfs, rp)
+
+                # skip the neutrinos
+                if (abs(ys[0]) in neutrinos) or (abs(cand[0]) in neutrinos):
+                    continue
+
+                # replace the particle ID by its class index; index 1 of both arrays is the charge
+                ys[0] = labels_ys_gen.index(map_pdgid_to_candid(abs(ys[0]), ys[1]))
+                cand[0] = labels_ys_cand.index(map_pdgid_to_candid(abs(cand[0]), cand[1]))
+
+                # the last entry (energy) is not part of the targets
+                ys_gen.append(np.delete(ys, -1))
+                ys_cand.append(np.delete(cand, -1))
+                Xs.append(elem_as_array(df_elem, ielem))
+
+        Xs = np.stack(Xs, axis=-1).T
+        ys_gen = np.stack(ys_gen, axis=-1).T
+        ys_cand = np.stack(ys_cand, axis=-1).T
+
+        return Xs, ys_gen, ys_cand
+
+    def map_reco_to_pf(df_pfs):
+        """Map each ("tr", index) / ("cl", index) key to the PF candidate built from that element."""
+        reco_to_pf = {}
+        for ipf in range(len(df_pfs)):
+            row = df_pfs.loc[ipf]
+            if row["track_idx"] != -1:
+                k = ("tr", int(row["track_idx"]))
+            elif row["cluster_idx"] != -1:
+                k = ("cl", int(row["cluster_idx"]))
+            else:
+                # PF should always have a track or a cluster associated
+                raise AssertionError("PF candidate {} has neither a track nor a cluster".format(ipf))
+            assert k not in reco_to_pf
+            reco_to_pf[k] = ipf
+        return reco_to_pf
+
+    def match_gen_to_reco(df_gen, df_tr, df_cl, reco_to_pf):
+        """Greedily give each genparticle one reco element; return {reco key: (gp, pf or None)}."""
+        dg = nx.Graph()
+        gps = set()
+
+        # loop over clusters, get all genparticles associated to clusters
+        for icl in range(len(df_cl)):
+            dg.add_node(("cl", icl))
+            for gp, weight in df_cl.loc[icl]["gp_contributions"].items():
+                gp = int(gp)
+                if filter_gp(df_gen, gp):
+                    dg.add_node(("gp", gp))
+                    gps.add(gp)
+                    dg.add_edge(("gp", gp), ("cl", icl), weight=weight)
+
+        # loop over tracks, get all genparticles associated to tracks
+        for itr in range(len(df_tr)):
+            dg.add_node(("tr", itr))
+            for gp in df_tr.loc[itr]["gp_contributions"]:
+                gp = int(gp)
+                if filter_gp(df_gen, gp):
+                    dg.add_node(("gp", gp))
+                    gps.add(gp)
+
+                    # the track is added to the genparticle with a very high weight
+                    # because we always want to associate the genparticle to a track if it's possible
+                    dg.add_edge(("gp", gp), ("tr", itr), weight=9999.0)
+
+        # The matching is greedy, so the visiting order decides which genparticle claims a shared
+        # reco element. Iterate in sorted order instead of relying on set iteration order.
+        pairs = {}
+        for gp in sorted(gps):
+            gp_node = ("gp", gp)
+
+            # the neighboring reco elements (clusters and tracks) that are still unclaimed
+            neighbors = list(dg.neighbors(gp_node))
+            if not neighbors:
+                # no key reco element is left: the genparticle is merged into others
+                # and cannot be reconstructed
+                continue
+
+            # choose the neighbor with the largest edge weight (deposited energy) as the "key"
+            # reco element; on a tie max() returns the first neighbor
+            reco_obj = max(neighbors, key=lambda n: dg.edges[gp_node, n]["weight"])
+
+            # remove the reco element from the graph, so it can't be associated to anything else
+            dg.remove_node(reco_obj)
+
+            assert reco_obj not in pairs
+            pairs[reco_obj] = (gp, reco_to_pf.get(reco_obj))
+        return pairs
+
+    # close the file handle as soon as the JSON has been parsed
+    with bz2.BZ2File(fn, "r") as fi:
+        data = json.load(fi)
+
+    ret = []
+    for event in data:
+        df_gen = pandas.DataFrame(event["genparticles"])
+        df_cl = pandas.DataFrame(event["clusters"])
+        df_tr = pandas.DataFrame(event["tracks"])
+        df_pfs = pandas.DataFrame(event["pfs"])
+
+        # derive the track momentum components from omega, phi and tan_lambda
+        df_tr["pt"] = track_pt(df_tr["omega"])
+        df_tr["px"] = np.cos(df_tr["phi"]) * df_tr["pt"]
+        df_tr["py"] = np.sin(df_tr["phi"]) * df_tr["pt"]
+        df_tr["pz"] = df_tr["tan_lambda"] * df_tr["pt"]
+
+        reco_to_pf = map_reco_to_pf(df_pfs)
+        pairs = match_gen_to_reco(df_gen, df_tr, df_cl, reco_to_pf)
+
+        ret.append(flatten_event(df_tr, df_cl, df_gen, df_pfs, pairs))
+    return ret
diff --git a/mlpf/data/genjob.jdl → mlpf/data_cms/genjob.jdl b/mlpf/data/genjob.jdl → mlpf/data_cms/genjob.jdl
diff --git a/mlpf/data/genjob.sh → mlpf/data_cms/genjob.sh b/mlpf/data/genjob.sh → mlpf/data_cms/genjob.sh
@@ -63,6 +63,6 @@ cmsRun step2_phase1_new.py
 cmsRun step3_phase1_new.py
 cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py
 mv pfntuple.root pfntuple_${SEED}.root
-python3 ${MLPF_PATH}/mlpf/data/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table
+python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table
 bzip2 -z pfntuple_${SEED}.pkl
 #rm step*.root
diff --git a/mlpf/data/genjob_pu.sh → mlpf/data_cms/genjob_pu.sh b/mlpf/data/genjob_pu.sh → mlpf/data_cms/genjob_pu.sh
@@ -13,7 +13,7 @@ WORKDIR=`pwd`/$SAMPLE/$SEED
 mkdir -p $WORKDIR
 
 PILEUP=Run3_Flat55To75_PoissonOOTPU
-PILEUP_INPUT=filelist:${MLPF_PATH}/mlpf/data/pu_files_local.txt
+PILEUP_INPUT=filelist:${MLPF_PATH}/mlpf/data_cms/pu_files_local.txt
 
 N=100
 
@@ -65,6 +65,6 @@ cmsRun step2_phase1_new.py
 cmsRun step3_phase1_new.py
 cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py
 mv pfntuple.root pfntuple_${SEED}.root
-python3 ${MLPF_PATH}/mlpf/data/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table
+python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table
 bzip2 -z pfntuple_${SEED}.pkl
 #rm step*.root
diff --git a/mlpf/data/multicrab.py → mlpf/data_cms/multicrab.py b/mlpf/data/multicrab.py → mlpf/data_cms/multicrab.py
diff --git a/mlpf/data/postprocessing2.py → mlpf/data_cms/postprocessing2.py b/mlpf/data/postprocessing2.py → mlpf/data_cms/postprocessing2.py
diff --git a/mlpf/data/prepare_args.py → mlpf/data_cms/prepare_args.py b/mlpf/data/prepare_args.py → mlpf/data_cms/prepare_args.py
@@ -6,21 +6,22 @@
 outdir = "/hdfs/local/joosep/mlpf/gen/v2"
 
 samples = [
-    #    "SinglePiMinusFlatPt0p7To1000_cfi",
-    #    "SingleGammaFlatPt1To1000_pythia8_cfi",
-    #    "SingleElectronFlatPt1To1000_pythia8_cfi",
-    #    "SingleTauFlatPt1To1000_cfi",
-    #    "SinglePi0Pt1To1000_pythia8_cfi",
-    #    "SingleProtonMinusFlatPt0p7To1000_cfi",
-    #    "SingleNeutronFlatPt0p7To1000_cfi",
-    #    "SingleMuFlatLogPt_100MeVto2TeV_cfi",
+    "SinglePiMinusFlatPt0p7To1000_cfi",
+    "SingleGammaFlatPt1To1000_pythia8_cfi",
+    "SingleElectronFlatPt1To1000_pythia8_cfi",
+    "SingleTauFlatPt1To1000_cfi",
+    "SinglePi0Pt1To1000_pythia8_cfi",
+    "SingleProtonMinusFlatPt0p7To1000_cfi",
+    "SingleNeutronFlatPt0p7To1000_cfi",
+    "SingleMuFlatLogPt_100MeVto2TeV_cfi",
 ]
 
 samples_pu = [
-    "TTbar_14TeV_TuneCUETP8M1_cfi",
-    "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi",
-    "QCDForPF_14TeV_TuneCUETP8M1_cfi",
-    "QCD_Pt_3000_7000_14TeV_TuneCUETP8M1_cfi",
+    # "TTbar_14TeV_TuneCUETP8M1_cfi",
+    # "ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi",
+    # "QCDForPF_14TeV_TuneCUETP8M1_cfi",
+    # "QCD_Pt_3000_7000_14TeV_TuneCUETP8M1_cfi",
+    "SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi",
 ]
 
 NUM_SAMPLES = 1000
@@ -31,14 +32,10 @@
     for s in samples_pu + samples:
         is_pu = s in samples_pu
 
-        num = 10
-        if is_pu:
-            num = NUM_SAMPLES
-
         os.makedirs(outdir + "/" + s + "/raw", exist_ok=True)
         os.makedirs(outdir + "/" + s + "/root", exist_ok=True)
 
-        for nsamples in range(num):
+        for nsamples in range(NUM_SAMPLES):
             if not os.path.isfile(outdir + "/" + s + "/raw/pfntuple_{}.pkl.bz2".format(iseed)):
                 if is_pu:
                     print("sbatch mlpf/tallinn/genjob_pu.sh {} {}".format(s, iseed))

diff --git a/mlpf/data/pu_files.txt → mlpf/data_cms/pu_files.txt b/mlpf/data/pu_files.txt → mlpf/data_cms/pu_files.txt
diff --git a/mlpf/data/run_gen.sh → mlpf/data_cms/run_gen.sh b/mlpf/data/run_gen.sh → mlpf/data_cms/run_gen.sh