diff --git a/mlpf/data_cms/genjob_nopu.sh b/mlpf/data_cms/genjob_nopu.sh index a60c4bda3..c19077759 100755 --- a/mlpf/data_cms/genjob_nopu.sh +++ b/mlpf/data_cms/genjob_nopu.sh @@ -6,7 +6,7 @@ set -e set -x -OUTDIR=/local/joosep/mlpf/cms/20240702_cptruthdef/nopu/ +OUTDIR=/local/joosep/mlpf/cms/20240823_simcluster/nopu/ CMSSWDIR=/scratch/persistent/joosep/CMSSW_14_1_0_pre3 MLPF_PATH=/home/joosep/particleflow/ @@ -22,7 +22,7 @@ mkdir -p $OUTDIR PILEUP=NoPileUp PILEUP_INPUT= -N=200 +N=100 env source /cvmfs/cms.cern.ch/cmsset_default.sh @@ -73,8 +73,8 @@ ls -lrt echo "process.RandomNumberGeneratorService.generator.initialSeed = $SEED" >> step2_phase1_new.py cmsRun step2_phase1_new.py > /dev/null cmsRun step3_phase1_new.py > /dev/null -#cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py mv pfntuple.root pfntuple_${SEED}.root + python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ bzip2 -z pfntuple_${SEED}.pkl cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/ @@ -82,6 +82,6 @@ cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/ #copy ROOT outputs #cp step2_phase1_new.root $OUTDIR/$SAMPLE/root/step2_${SEED}.root #cp step3_phase1_new.root $OUTDIR/$SAMPLE/root/step3_${SEED}.root -#cp pfntuple_${SEED}.root $OUTDIR/$SAMPLE/root/ +cp pfntuple_${SEED}.root $OUTDIR/$SAMPLE/root/ rm -Rf /scratch/local/joosep/$SLURM_JOBID diff --git a/mlpf/data_cms/genjob_pu55to75.sh b/mlpf/data_cms/genjob_pu55to75.sh index cd5763e41..54ed7c166 100755 --- a/mlpf/data_cms/genjob_pu55to75.sh +++ b/mlpf/data_cms/genjob_pu55to75.sh @@ -7,7 +7,7 @@ set -e set -x -OUTDIR=/local/joosep/mlpf/cms/20240702_cptruthdef/pu55to75/ +OUTDIR=/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/ CMSSWDIR=/scratch/persistent/joosep/CMSSW_14_1_0_pre3 MLPF_PATH=/home/joosep/particleflow/ @@ -77,13 +77,13 @@ cmsRun step2_phase1_new.py > /dev/null cmsRun step3_phase1_new.py > /dev/null #cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py mv pfntuple.root pfntuple_${SEED}.root -python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ -bzip2 -z pfntuple_${SEED}.pkl -cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/ +# python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ +# bzip2 -z pfntuple_${SEED}.pkl +# cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/ #copy ROOT outputs #cp step2_phase1_new.root $OUTDIR/$SAMPLE/root/step2_${SEED}.root #cp step3_phase1_new.root $OUTDIR/$SAMPLE/root/step3_${SEED}.root -#cp pfntuple_${SEED}.root $OUTDIR/$SAMPLE/root/ +cp pfntuple_${SEED}.root $OUTDIR/$SAMPLE/root/ rm -Rf /scratch/local/joosep/$SLURM_JOBID diff --git a/mlpf/data_cms/postprocessing2.py b/mlpf/data_cms/postprocessing2.py index 1a36e4fac..6d1c9e3e6 100644 --- a/mlpf/data_cms/postprocessing2.py +++ b/mlpf/data_cms/postprocessing2.py @@ -2,16 +2,13 @@ import os import pickle -import matplotlib -import matplotlib.pyplot as plt import networkx as nx import numpy as np import tqdm import uproot import vector import awkward - -matplotlib.use("Agg") +from sklearn.neighbors import KDTree elem_branches = [ "typ", @@ -70,7 +67,7 @@ "phierror4", ] -target_branches = ["typ", "charge", "pt", "eta", "sin_phi", "cos_phi", "e", "ispu", "orig_pid"] +target_branches = ["pid", "charge", "pt", "eta", "sin_phi", "cos_phi", "e", "ispu"] def print_gen(g, min_pt=1): @@ -87,9 +84,19 @@ def print_gen(g, min_pt=1): if g.nodes[node]["pt"] > min_pt: print(node, g.nodes[node]["pt"], g.nodes[node]["eta"], g.nodes[node]["phi"], g.nodes[node]["typ"]) - gen_nodes = [n for n in 
g.nodes if n[0] == "cp" and g.nodes[n]["pt"] > min_pt] - for node in gen_nodes: - children = [(g.nodes[suc]["typ"], g.edges[node, suc]["weight"]) for suc in g.successors(node)] + # sc_nodes = [n for n in g.nodes if n[0] == "sc"] + # for node in sc_nodes: + # print(node, g.nodes[node]["pt"], g.nodes[node]["eta"], g.nodes[node]["phi"], g.nodes[node]["pid"]) + # children = list(g.successors(node)) + # int_e = 0.0 + # for ch in children: + # print(" ", ch, g.edges[node, ch]["weight"]) + # int_e += g.edges[node, ch]["weight"] + # g.nodes[node]["interacting"] = int_e + + cp_nodes = [n for n in g.nodes if n[0] == "cp"] + for node in cp_nodes: + children = list(g.successors(node)) print(node, g.nodes[node]["pt"], g.nodes[node]["eta"], g.nodes[node]["phi"], g.nodes[node]["pid"], children) @@ -133,63 +140,6 @@ def get_charge(pid): raise Exception("Unknown pid: ", pid) -def draw_event(g): - pos = {} - for node in g.nodes: - pos[node] = (g.nodes[node]["eta"], g.nodes[node]["phi"]) - - fig = plt.figure(figsize=(10, 10)) - - nodes_to_draw = [n for n in g.nodes if n[0] == "elem"] - nx.draw_networkx( - g, - pos=pos, - with_labels=False, - node_size=5, - nodelist=nodes_to_draw, - edgelist=[], - node_color="red", - node_shape="s", - alpha=0.5, - ) - - nodes_to_draw = [n for n in g.nodes if n[0] == "pfcand"] - nx.draw_networkx( - g, - pos=pos, - with_labels=False, - node_size=10, - nodelist=nodes_to_draw, - edgelist=[], - node_color="green", - node_shape="x", - alpha=0.5, - ) - - nodes_to_draw = [n for n in g.nodes if (n[0] == "cp")] - nx.draw_networkx( - g, - pos=pos, - with_labels=False, - node_size=1, - nodelist=nodes_to_draw, - edgelist=[], - node_color="blue", - node_shape=".", - alpha=0.5, - ) - - # draw edges between genparticles and elements - edges_to_draw = [e for e in g.edges if e[0] in nodes_to_draw] - nx.draw_networkx_edges(g, pos, edgelist=edges_to_draw, arrows=False, alpha=0.1) - - plt.xlim(-6, 6) - plt.ylim(-4, 4) - plt.tight_layout() - plt.axis("on") - return fig - - def compute_gen_met(g): genpart = [elem for elem in g.nodes if elem[0] == "cp"] px = np.sum([g.nodes[elem]["pt"] * np.cos(g.nodes[elem]["phi"]) for elem in genpart]) @@ -198,178 +148,65 @@ def compute_gen_met(g): return met -def merge_closeby_particles(g, deltar_cut=0.01, max_iter=100): - print("merging closeby met={:.2f}".format(compute_gen_met(g))) - - for it in range(max_iter): - particles_to_merge = [elem for elem in g.nodes if elem[0] == "cp"] - part_eta = [g.nodes[node]["eta"] for node in particles_to_merge] - part_phi = [g.nodes[node]["phi"] for node in particles_to_merge] - - # find pairs that are close by in deltaR - # note that if there are >2 particles close by to each other, only the closest 2 get merged - merge_pairs = [] - pairs_0, pairs_1 = deltar_pairs(part_eta, part_phi, deltar_cut) - - # no closeby particles, break - if len(pairs_0) == 0: - break - merge_pairs = [(particles_to_merge[p0], particles_to_merge[p1]) for p0, p1 in zip(pairs_0, pairs_1)] - - print("merging {} pairs".format(len(merge_pairs))) - for pair in merge_pairs: - if pair[0] in g.nodes and pair[1] in g.nodes: - lv = vector.obj(pt=0, eta=0, phi=0, E=0) - sum_pu = 0.0 - sum_tot = 0.0 - for gp in pair: - lv += vector.obj( - pt=g.nodes[gp]["pt"], - eta=g.nodes[gp]["eta"], - phi=g.nodes[gp]["phi"], - E=g.nodes[gp]["e"], - ) - sum_pu += g.nodes[gp]["ispu"] * g.nodes[gp]["e"] - sum_tot += g.nodes[gp]["e"] - - # now update the remaining particle properties - g.nodes[pair[0]]["pt"] = lv.pt - g.nodes[pair[0]]["eta"] = lv.eta - g.nodes[pair[0]]["phi"] = 
lv.phi - g.nodes[pair[0]]["e"] = lv.energy - g.nodes[pair[0]]["ispu"] = sum_pu / sum_tot - orig_pid = g.nodes[pair[0]]["pid"] - if g.nodes[pair[1]]["e"] > g.nodes[pair[0]]["e"]: - orig_pid = g.nodes[pair[1]]["pid"] - g.nodes[pair[0]]["pid"] = orig_pid - - # add edge weights from the deleted particle to the remaining particle - for suc in g.successors(pair[1]): - if (pair[0], suc) in g.edges: - g.edges[(pair[0], suc)]["weight"] += g.edges[(pair[1], suc)]["weight"] - g.remove_nodes_from([pair[1]]) - print("done merging, met={:.2f}".format(compute_gen_met(g))) - - -def cleanup_graph(g, node_energy_threshold=0.1, edge_energy_threshold=0.05): - g = g.copy() - - print("start cleanup, met={:.2f}".format(compute_gen_met(g))) - - # For each truth particle, compute the energy in tracks or calorimeter clusters - for node in g.nodes: - - # CaloParticles or TrackingParticles - if node[0] == "cp": - E_track = 0.0 - E_calo = 0.0 - E_other = 0.0 - E_hf = 0.0 - E_hfem = 0.0 - E_hfhad = 0.0 - - # remap PID to PF-like - g.nodes[node]["remap_pid"] = map_pdgid_to_candid(abs(g.nodes[node]["pid"]), g.nodes[node]["charge"]) - - for suc in g.successors(node): - elem_type = g.nodes[suc]["typ"] - if elem_type in [1, 6]: - E_track += g.edges[node, suc]["weight"] - elif elem_type in [4, 5, 10, 11]: - E_calo += g.edges[node, suc]["weight"] - elif elem_type in [8, 9]: - if elem_type == 8: - E_hfem += g.edges[node, suc]["weight"] - elif elem_type == 9: - E_hfhad += g.edges[node, suc]["weight"] - E_hf += g.edges[node, suc]["weight"] - else: - E_other += g.edges[node, suc]["weight"] - - g.nodes[node]["E_track"] = E_track - g.nodes[node]["E_calo"] = E_calo - g.nodes[node]["E_other"] = E_other - g.nodes[node]["E_hf"] = E_hf - g.nodes[node]["E_hfem"] = E_hfem - g.nodes[node]["E_hfhad"] = E_hfhad - - # If there are multiple tracks matched to a gen/sim particle, keep the association to the closest one by dR - for node in g.nodes: - if node[0] == "cp": - # collect tracks or GSFs - tracks = [] - for suc in g.successors(node): - typ = g.nodes[suc]["typ"] - if typ == 1 or typ == 6: - tracks.append(suc) - if len(tracks) > 1: - n0 = g.nodes[node] - drs = [] - for tr in tracks: - n1 = g.nodes[tr] - deta = np.abs(n0["eta"] - n1["eta"]) - dphi = np.mod(n0["phi"] - n1["phi"] + np.pi, 2 * np.pi) - np.pi - dr2 = deta**2 + dphi**2 - drs.append(dr2) - imin = np.argmin(drs) - - # set the weight of the edge to the other tracks to 0 - for itr in range(len(tracks)): - if itr != imin: - g.edges[(node, tracks[itr])]["weight"] = 0.0 +def split_caloparticles(g, elem_type): - for node in g.nodes: - if node[0] == "cp": - remap_pid = g.nodes[node]["remap_pid"] - - # charged particles that leave no track should not be reconstructed as charged - if remap_pid in [211, 13] and g.nodes[node]["E_track"] == 0: - g.nodes[node]["remap_pid"] = 130 - g.nodes[node]["charge"] = 0 - if remap_pid in [11] and g.nodes[node]["E_track"] == 0: - g.nodes[node]["remap_pid"] = 22 - g.nodes[node]["charge"] = 0 - - # if a particle only leaves deposits in the HF, it should be reconstructed as an HF candidate - if (g.nodes[node]["E_track"] == 0) and (g.nodes[node]["E_calo"] == 0) and (g.nodes[node]["E_other"] == 0) and g.nodes[node]["E_hf"] > 0: - if g.nodes[node]["E_hfhad"] > g.nodes[node]["E_hfem"]: - g.nodes[node]["remap_pid"] = 1 - g.nodes[node]["charge"] = 0 - else: - g.nodes[node]["remap_pid"] = 2 - g.nodes[node]["charge"] = 0 - - # CaloParticles contain a lot of electrons and muons with a soft pt spectrum - # these should not be attempted to be reconstructed as ele/mu, 
but rather as charged or neutral hadrons - for node in g.nodes: - if node[0] == "cp": - nd = g.nodes[node] - if nd["pt"] < 1.0 and (abs(nd["remap_pid"]) == 11 or abs(nd["remap_pid"]) == 13): - if g.nodes[node]["E_track"] > g.nodes[node]["E_calo"]: - g.nodes[node]["remap_pid"] = 211 - else: - if abs(nd["remap_pid"]) == 11: - g.nodes[node]["remap_pid"] = 22 - else: - g.nodes[node]["remap_pid"] = 130 - g.nodes[node]["charge"] = 0 - - # remove calopart/trackingpart not linked to any elements - # as these are not reconstructable in principle - nodes_to_remove = [] - for node in g.nodes: - if node[0] == "cp": - deg = g.degree[node] - if deg == 0: - nodes_to_remove += [node] - g.remove_nodes_from(nodes_to_remove) - print("unlinked cleanup, met={:.2f}".format(compute_gen_met(g))) + # loop over caloparticles + cps = [(g.nodes[n]["pt"], n) for n in g.nodes if n[0] == "cp"] + for _, cp in cps: - return g + # get all associated elements with type==elem_type that received a contribution from this caloparticle + sucs = [(suc, g.edges[cp, suc]["weight"], g.nodes[suc]["e"]) for suc in g.successors(cp) if g.nodes[suc]["typ"] == elem_type] + sum_sucs_w = sum([s[1] for s in sucs]) + sucs = [s[0] for s in sucs] + if len(sucs) > 1: + lv = vector.obj( + pt=g.nodes[cp]["pt"], + eta=g.nodes[cp]["eta"], + phi=g.nodes[cp]["phi"], + e=g.nodes[cp]["e"], + ) + lv_fracs = [] + for suc in sucs: + frac_e = g.edges[cp, suc]["weight"] / sum_sucs_w + lv_fracs.append(lv * frac_e) + + max_cp = max([n[1] for n in g.nodes if n[0] == "cp"]) + 1 + new_cp_index = max_cp + for lv_frac, suc in zip(lv_fracs, sucs): + g.add_node( + ("cp", new_cp_index), + pt=lv_frac.pt, + eta=lv_frac.eta, + phi=lv_frac.phi, + e=lv_frac.e, + charge=g.nodes[cp]["charge"], + pid=g.nodes[cp]["pid"], + ispu=g.nodes[cp]["ispu"], + ) + g.add_edge(("cp", new_cp_index), suc, weight=g.edges[cp, suc]["weight"]) + new_cp_index += 1 + g.remove_node(cp) + + +def find_representative_elements(g, elem_to_gp, gp_to_elem, elem_type): + unused_elems = [] + elems = [(g.nodes[e]["pt"], e) for e in g.nodes if e[0] == "elem" and g.nodes[e]["typ"] == elem_type] + elems_sorted = sorted(elems, key=lambda x: x[0], reverse=True) + for _, elem in elems_sorted: + gps = list(g.predecessors(elem)) + gps_weight = [(g.edges[(gp, elem)]["weight"], gp) for gp in gps if gp not in gp_to_elem if gp[0] == "cp"] + gps_weight_sorted = sorted(gps_weight, key=lambda x: x[0], reverse=True) + if len(gps_weight_sorted) > 0: + gp = gps_weight_sorted[0][1] + elem_to_gp[elem] = gp + gp_to_elem[gp] = elem + else: + unused_elems.append(elem) -def prepare_normalized_table(g, genparticle_energy_threshold=0.2): - # rg = g.reverse() +def prepare_normalized_table(g, iev): + split_caloparticles(g, 1) + # pickle.dump(g, open("split_g_{}.pkl".format(iev), "wb"), pickle.HIGHEST_PROTOCOL) all_genparticles = [] all_elements = [] @@ -385,33 +222,34 @@ def prepare_normalized_table(g, genparticle_energy_threshold=0.2): all_genparticles = list(set(all_genparticles)) all_elements = sorted(all_elements) - # assign genparticles in reverse energy order uniquely to best element elem_to_gp = {} # map of element -> genparticles - unmatched_gp = [] - for gp in sorted(all_genparticles, key=lambda x: g.nodes[x]["e"], reverse=True): - elems = [e for e in g.successors(gp)] - - # sort elements by energy deposit from genparticle - elems_sorted = sorted( - [(g.edges[gp, e]["weight"], e) for e in elems], - key=lambda x: x[0], - reverse=True, - ) - - chosen_elem = None - for weight, elem in elems_sorted: - if not (elem in elem_to_gp): 
- chosen_elem = elem - elem_to_gp[elem] = [] - break - - if chosen_elem is None: - unmatched_gp += [gp] - else: - elem_to_gp[elem] += [gp] + gp_to_elem = {} # map of genparticle -> element + + # assign genparticles in reverse pt order uniquely to best element + find_representative_elements(g, elem_to_gp, gp_to_elem, 1) + find_representative_elements(g, elem_to_gp, gp_to_elem, 6) + find_representative_elements(g, elem_to_gp, gp_to_elem, 4) + find_representative_elements(g, elem_to_gp, gp_to_elem, 5) + find_representative_elements(g, elem_to_gp, gp_to_elem, 8) + find_representative_elements(g, elem_to_gp, gp_to_elem, 9) + find_representative_elements(g, elem_to_gp, gp_to_elem, 10) + find_representative_elements(g, elem_to_gp, gp_to_elem, 11) + + s1 = set(list(gp_to_elem.keys())) + s2 = set(all_genparticles) + unmatched_gp = list(s2 - s1) + # for gp in unmatched_gp: + # print(gp, g.nodes[gp]["pid"], g.nodes[gp]["pt"]) + + # s1 = set(list(elem_to_gp.keys())) + # s2 = set(all_elements) + # unmatched_elem = list(s2 - s1) + # for elem in unmatched_elem: + # print(elem, g.nodes[elem]["typ"], g.nodes[elem]["pt"]) # assign unmatched genparticles to best element, allowing for overlaps - for gp in sorted(unmatched_gp, key=lambda x: g.nodes[x]["e"], reverse=True): + elem_to_gp = {k: [v] for k, v in elem_to_gp.items()} + for gp in sorted(unmatched_gp, key=lambda x: g.nodes[x]["pt"], reverse=True): elems = [e for e in g.successors(gp)] elems_sorted = sorted( [(g.edges[gp, e]["weight"], e) for e in elems], @@ -421,12 +259,11 @@ def prepare_normalized_table(g, genparticle_energy_threshold=0.2): _, elem = elems_sorted[0] elem_to_gp[elem] += [gp] + # Find primary element for each PFCandidate unmatched_cand = [] elem_to_cand = {} - - # Find primary element for each PFCandidate - for cand in sorted(all_pfcandidates, key=lambda x: g.nodes[x]["e"], reverse=True): - tp = g.nodes[cand]["typ"] + for cand in sorted(all_pfcandidates, key=lambda x: g.nodes[x]["pt"], reverse=True): + tp = g.nodes[cand]["pid"] neighbors = list(g.predecessors(cand)) chosen_elem = None @@ -479,13 +316,11 @@ def prepare_normalized_table(g, genparticle_energy_threshold=0.2): ycand.fill(0.0) for ielem, elem in enumerate(all_elements): - elem_type = g.nodes[elem]["typ"] genparticles = sorted( elem_to_gp.get(elem, []), key=lambda x: g.edges[(x, elem)]["weight"], reverse=True, ) - # genparticles = [gp for gp in genparticles if g.nodes[gp]["e"] > genparticle_energy_threshold] candidate = elem_to_cand.get(elem, None) for j in range(len(elem_branches)): @@ -499,13 +334,12 @@ def prepare_normalized_table(g, genparticle_energy_threshold=0.2): # if several CaloParticles/TrackingParticles are associated to ONLY this element, merge them, as they are not reconstructable separately if len(genparticles) > 0: + pids_e = sorted([(g.nodes[gp]["pid"], g.nodes[gp]["e"]) for gp in genparticles], key=lambda x: x[1], reverse=True) + # get the pid of the highest-energy particle associated with this element + pid = pids_e[0][0] - orig_pid = [(g.nodes[gp]["pid"], g.nodes[gp]["e"]) for gp in genparticles] - orig_pid = sorted(orig_pid, key=lambda x: x[1], reverse=True) - orig_pid = orig_pid[0][0] - - pid = g.nodes[genparticles[0]]["remap_pid"] charge = g.nodes[genparticles[0]]["charge"] + pid = map_pdgid_to_candid(pid, charge) sum_pu = 0.0 sum_tot = 0.0 @@ -519,42 +353,23 @@ def prepare_normalized_table(g, genparticle_energy_threshold=0.2): sum_pu += g.nodes[gp]["ispu"] * g.nodes[gp]["e"] sum_tot += g.nodes[gp]["e"] - # remap PID in case of HCAL cluster to neutral - 
if elem_type == 5 and (pid == 22 or pid == 11): - pid = 130 - - # remap forward region to HFHAD or HFEM - if elem_type in [8, 9]: - if pid == 130: - pid = 1 - elif pid == 22: - pid = 2 - - # Remap HF candidates to neutral hadron or photon in case not matched to HF - if elem_type in [2, 3, 4, 5]: - if pid == 1: - pid = 130 - elif pid == 2: - pid = 22 - gp = { "pt": lv.rho, "eta": lv.eta, "sin_phi": np.sin(lv.phi), "cos_phi": np.cos(lv.phi), "e": lv.t, - "typ": pid, - "orig_pid": orig_pid, + "pid": pid, "px": lv.x, "py": lv.y, "pz": lv.z, "ispu": sum_pu / sum_tot, - "charge": charge if pid in [211, 11, 13] else 0, + "charge": charge, } - # print(" mlpf: type={} E={:.2f} eta={:.2f} phi={:.2f} q={}".format(pid, lv.t, lv.eta, lv.phi, gp["charge"])) for j in range(len(target_branches)): ygen[target_branches[j]][ielem] = gp[target_branches[j]] + px = np.sum(ygen["pt"] * ygen["cos_phi"]) py = np.sum(ygen["pt"] * ygen["sin_phi"]) met = np.sqrt(px**2 + py**2) @@ -639,6 +454,14 @@ def make_graph(ev, iev): caloparticle_ev = ev["caloparticle_ev"][iev] caloparticle_idx_trackingparticle = ev["caloparticle_idx_trackingparticle"][iev] + simcluster_pid = ev["simcluster_pid"][iev] + simcluster_pt = ev["simcluster_pt"][iev] + simcluster_e = ev["simcluster_energy"][iev] + simcluster_eta = ev["simcluster_eta"][iev] + simcluster_phi = ev["simcluster_phi"][iev] + simcluster_idx_trackingparticle = ev["simcluster_idx_trackingparticle"][iev] + simcluster_idx_caloparticle = ev["simcluster_idx_caloparticle"][iev] + pfcandidate_pdgid = ev["pfcandidate_pdgid"][iev] pfcandidate_pt = ev["pfcandidate_pt"][iev] pfcandidate_e = ev["pfcandidate_energy"][iev] @@ -719,7 +542,7 @@ def make_graph(ev, iev): for iobj in range(len(gen_pdgid)): g.add_node( ("gen", iobj), - typ=gen_pdgid[iobj], + pid=abs(gen_pdgid[iobj]), pt=gen_pt[iobj], e=gen_e[iobj], eta=gen_eta[iobj], @@ -735,7 +558,7 @@ def make_graph(ev, iev): for iobj in range(len(trackingparticle_pid)): g.add_node( ("tp", iobj), - pid=trackingparticle_pid[iobj], + pid=abs(trackingparticle_pid[iobj]), charge=trackingparticle_charge[iobj], pt=trackingparticle_pt[iobj], e=trackingparticle_e[iobj], @@ -746,13 +569,9 @@ def make_graph(ev, iev): # CaloParticles for iobj in range(len(caloparticle_pid)): - if abs(caloparticle_pid[iobj]) == 15: - print( - "tau caloparticle pt={}, this will introduce fake MET due to inclusion of neutrino in the caloparticle".format(caloparticle_pt[iobj]) - ) g.add_node( ("cp", iobj), - pid=caloparticle_pid[iobj], + pid=abs(caloparticle_pid[iobj]), charge=caloparticle_charge[iobj], pt=caloparticle_pt[iobj], e=caloparticle_e[iobj], @@ -760,12 +579,32 @@ def make_graph(ev, iev): phi=caloparticle_phi[iobj], ispu=float(caloparticle_ev[iobj] != 0), ) + itp = caloparticle_idx_trackingparticle[iobj] + if itp != -1: + g.add_edge(("cp", iobj), ("tp", itp)) + + # SimClusters + for iobj in range(len(simcluster_pid)): + g.add_node( + ("sc", iobj), + pid=abs(simcluster_pid[iobj]), + pt=simcluster_pt[iobj], + eta=simcluster_eta[iobj], + phi=simcluster_phi[iobj], + e=simcluster_e[iobj], + ) + icp = simcluster_idx_caloparticle[iobj] + g.add_edge(("cp", icp), ("sc", iobj)) + + itp = simcluster_idx_trackingparticle[iobj] + if itp != -1: + g.add_edge(("sc", iobj), ("tp", itp)) # baseline PF for cross-checks for iobj in range(len(pfcandidate_pdgid)): g.add_node( ("pfcand", iobj), - typ=abs(pfcandidate_pdgid[iobj]), + pid=abs(pfcandidate_pdgid[iobj]), pt=pfcandidate_pt[iobj], e=pfcandidate_e[iobj], eta=pfcandidate_eta[iobj], @@ -773,61 +612,126 @@ def make_graph(ev, iev): 
cos_phi=np.cos(pfcandidate_phi[iobj]),
             charge=get_charge(pfcandidate_pdgid[iobj]),
             ispu=0.0,  # for PF candidates, we don't know if it was PU or not
-            orig_pid=0,  # placeholder to match processed gp
         )
 
     trackingparticle_to_element_first = ev["trackingparticle_to_element.first"][iev]
     trackingparticle_to_element_second = ev["trackingparticle_to_element.second"][iev]
     trackingparticle_to_element_cmp = ev["trackingparticle_to_element_cmp"][iev]
-    # for trackingparticles associated to elements, set a very high edge weight
-    for tp, elem, c in zip(
+    for iobj, elem, c in zip(
         trackingparticle_to_element_first,
         trackingparticle_to_element_second,
         trackingparticle_to_element_cmp,
     ):
-        # ignore BREM, because the TrackingParticle is already linked to GSF
-        if g.nodes[("elem", elem)]["typ"] in [7]:
+        if g.nodes[("elem", elem)]["typ"] in [2, 3, 7]:
             continue
-        g.add_edge(("tp", tp), ("elem", elem), weight=c)
+        if ("tp", iobj) in g.nodes and ("elem", elem) in g.nodes:
+            # print(("tp", iobj), ("elem", elem), c)
+            g.add_edge(("tp", iobj), ("elem", elem), weight=c * g.nodes[("elem", elem)]["e"])
 
     caloparticle_to_element_first = ev["caloparticle_to_element.first"][iev]
     caloparticle_to_element_second = ev["caloparticle_to_element.second"][iev]
     caloparticle_to_element_cmp = ev["caloparticle_to_element_cmp"][iev]
-    for sc, elem, c in zip(
+    for iobj, elem, c in zip(
         caloparticle_to_element_first,
         caloparticle_to_element_second,
         caloparticle_to_element_cmp,
     ):
-        if not (g.nodes[("elem", elem)]["typ"] in [7]):
-            g.add_edge(("cp", sc), ("elem", elem), weight=c)
+        if g.nodes[("elem", elem)]["typ"] in [2, 3, 7]:
+            continue
+        if ("cp", iobj) in g.nodes and ("elem", elem) in g.nodes:
+            # print(("cp", iobj), ("elem", elem), c)
+            g.add_edge(("cp", iobj), ("elem", elem), weight=c)
+
+    simcluster_to_element_first = ev["simcluster_to_element.first"][iev]
+    simcluster_to_element_second = ev["simcluster_to_element.second"][iev]
+    simcluster_to_element_cmp = ev["simcluster_to_element_cmp"][iev]
+    for iobj, elem, c in zip(
+        simcluster_to_element_first,
+        simcluster_to_element_second,
+        simcluster_to_element_cmp,
+    ):
+        if not (g.nodes[("elem", elem)]["typ"] in [2, 3, 7]):
+            # print(("sc", iobj), ("elem", elem), c)
+            g.add_edge(("sc", iobj), ("elem", elem), weight=c)
 
     print("make_graph init, met={:.2f}".format(compute_gen_met(g)))
 
-    # merge caloparticles and trackingparticles that refer to the same particle
+    # reattach the element links of each trackingparticle to its parents (caloparticle or simcluster), then drop the trackingparticle
+    tps = [n for n in g.nodes if n[0] == "tp"]
+    for tp in tps:
+        preds = g.predecessors(tp)
+        sucs = g.successors(tp)
+        for pred in preds:
+            for suc in sucs:
+                if (pred, suc) not in g.edges:
+                    g.add_edge(pred, suc, weight=g.edges[(tp, suc)]["weight"])
+    g.remove_nodes_from(tps)
+
+    # pickle.dump(g, open("init_g_{}.pkl".format(iev), "wb"), pickle.HIGHEST_PROTOCOL)
+
+    # add any remaining links between SimClusters and unmatched Elements using delta-R proximity
+    elems = [n for n in g.nodes if n[0] == "elem"]
+    scs = [node for node in g.nodes if node[0] == "sc"]
+    sc_coords = np.array([[g.nodes[n]["eta"] for n in scs], [g.nodes[n]["phi"] for n in scs]])
+    tree = KDTree(sc_coords.T, leaf_size=32)
+    for elem in elems:
+        if len(list(g.predecessors(elem))) == 0 and g.nodes[elem]["pt"] > 1:
+            eta = g.nodes[elem]["eta"]
+            phi = g.nodes[elem]["phi"]
+            nearby_scs = tree.query_radius([[eta, phi]], 0.05)[0]
+            for isc in nearby_scs:
+                if scs[isc] in g.nodes:
+                    if (scs[isc], elem) not in g.edges:
+                        g.add_edge(scs[isc], elem, weight=g.nodes[elem]["e"])
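+
+    # note: the KDTree query above uses a plain Euclidean metric on (eta, phi),
+    # so phi is not wrapped at +-pi and matches across the phi boundary can be
+    # missed; the recovered edge is assigned the full element energy as its
+    # weight, since no measured contribution is available for these links.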
+
+    # reattach the element links of each simcluster to its parent caloparticle, then drop the simcluster
+    scs = [n for n in g.nodes if n[0] == "sc"]
+    for sc in scs:
+        preds = g.predecessors(sc)
+        sucs = g.successors(sc)
+        for pred in preds:
+            for suc in sucs:
+                if (pred, suc) not in g.edges:
+                    g.add_edge(pred, suc, weight=g.edges[(sc, suc)]["weight"])
+
+    g.remove_nodes_from(scs)
+    print("make_graph duplicates removed, met={:.2f}".format(compute_gen_met(g)))
+
+    elems = [n for n in g.nodes if n[0] == "elem"]
     nodes_to_remove = []
-    for idx_cp, idx_tp in enumerate(caloparticle_idx_trackingparticle):
-        if idx_tp != -1:
-
-            # add all the edges from the trackingparticle to the caloparticle
-            for elem in g.neighbors(("tp", idx_tp)):
-                g.add_edge(
-                    ("cp", idx_cp),
-                    elem,
-                    weight=g.edges[("tp", idx_tp), elem]["weight"],
-                )
-            # remove the trackingparticle, keep the caloparticle
-            nodes_to_remove += [("tp", idx_tp)]
+    for elem in elems:
+        if g.nodes[elem]["typ"] in [2, 3, 7]:
+            nodes_to_remove.append(elem)
     g.remove_nodes_from(nodes_to_remove)
-    print("make_graph duplicates removed, met={:.2f}".format(compute_gen_met(g)))
 
     # merge_closeby_particles(g)
-    # print("cleanup done, met={:.2f}".format(compute_gen_met(g)))
+    print("cleanup done, met={:.2f}".format(compute_gen_met(g)))
 
     element_to_candidate_first = ev["element_to_candidate.first"][iev]
     element_to_candidate_second = ev["element_to_candidate.second"][iev]
     for elem, pfcand in zip(element_to_candidate_first, element_to_candidate_second):
-        g.add_edge(("elem", elem), ("pfcand", pfcand), weight=1.0)
+        if ("elem", elem) in g.nodes:
+            g.add_edge(("elem", elem), ("pfcand", pfcand), weight=1.0)
 
+    # pickle.dump(g, open("cleanup_g_{}.pkl".format(iev), "wb"), pickle.HIGHEST_PROTOCOL)
+    # print_gen(g)
+
+    return g
+
+
+def cleanup_graph(g):
+    all_removed_edges = []
+    elems = [n for n in g.nodes if n[0] == "elem"]
+    for elem in elems:
+        edges_to_remove = []
+        for pred in g.predecessors(elem):
+            edge = (pred, elem)
+            if g.edges[edge]["weight"] / g.nodes[elem]["e"] < 0.1:
+                edges_to_remove.append(edge)
+        all_removed_edges += edges_to_remove
+    # print("removed edges:", all_removed_edges)
+    # for edge in all_removed_edges:
+    #     print(g.nodes[edge[0]]["e"], g.nodes[edge[1]]["e"], g.edges[edge]["weight"])
+    g.remove_edges_from(all_removed_edges)
 
     return g
@@ -847,10 +751,10 @@ def process(args):
     for iev in tqdm.tqdm(events_to_process):
         print("processing iev={}, genmet_cmssw={:.2f}".format(iev, ev["genmet_pt"][iev][0]))
         g = make_graph(ev, iev)
-        g = cleanup_graph(g)
+        # g = cleanup_graph(g)
 
         # associate target particles to input elements
-        Xelem, ycand, ygen = prepare_normalized_table(g)
+        Xelem, ycand, ygen = prepare_normalized_table(g, iev)
         data = {}
 
         # produce a list of stable pythia particles for downstream validation
@@ -860,7 +764,7 @@
             for n in g.nodes
             if n[0] == "gen" and ((g.nodes[n]["status"] == 1) or ((g.nodes[n]["status"] == 2) and g.nodes[n]["num_daughters"] == 0))
         ]
-        feats = ["typ", "pt", "eta", "phi", "e"]
+        feats = ["pid", "pt", "eta", "phi", "e"]
         arr_ptcls_pythia = np.array([[g.nodes[n][f] for f in feats] for n in ptcls_pythia])
 
         # produce pythia-level genjets and genmet
@@ -885,10 +789,6 @@
             "genmet": genmet,
         }
 
-        # print("trk", ygen[Xelem["typ"] == 1]["typ"])
-        # print("ecal", ygen[Xelem["typ"] == 4]["typ"])
-        # print("hcal", ygen[Xelem["typ"] == 4]["typ"])
-
         if args.save_full_graph:
             data["full_graph"] = g
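Note on postprocessing2.py above (illustrative sketch, not part of the patch): split_caloparticles distributes one CaloParticle four-vector over its matched elements in proportion to the association weights, so the fragments sum back to the original four-momentum. A minimal standalone version of that idea, assuming the scikit-hep vector package already used in this file (the weights below are hypothetical):

    import vector

    # hypothetical per-element association weights for one CaloParticle
    weights = [6.0, 3.0, 1.0]
    lv = vector.obj(pt=10.0, eta=1.2, phi=0.5, e=25.0)

    # scale the four-vector by each weight fraction, as split_caloparticles does
    pieces = [lv * (w / sum(weights)) for w in weights]

    # the pieces sum back to the original four-momentum (up to float precision)
    total = pieces[0] + pieces[1] + pieces[2]
    assert abs(total.pt - lv.pt) < 1e-6 and abs(total.e - lv.e) < 1e-6

diff --git a/mlpf/data_cms/postprocessing_jobs.py b/mlpf/data_cms/postprocessing_jobs.py
index 76f70d313..685470a3d 100644
--- a/mlpf/data_cms/postprocessing_jobs.py
+++ b/mlpf/data_cms/postprocessing_jobs.py
@@ -8,7 +8,7 @@ 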
def chunks(lst, n): yield lst[i : i + n] -def write_script(postprocessing_cmd, infiles, outfiles): +def write_script(infiles, outfiles): s = [] s += ["#!/bin/bash"] s += ["#SBATCH --partition short"] @@ -23,7 +23,7 @@ def write_script(postprocessing_cmd, infiles, outfiles): outf_no_bzip = outf.replace(".pkl.bz2", ".pkl") s += [f"if [ ! -f {outf} ]; then"] s += [ - " singularity exec -B /local /home/software/singularity/pytorch.simg:2024-06-26" + " singularity exec -B /local /home/software/singularity/pytorch.simg:2024-08-18" + f" python3 mlpf/data_cms/postprocessing2.py --input {inf} --outpath {outpath}" ] s += [f" bzip2 -z {outf_no_bzip}"] @@ -33,24 +33,14 @@ def write_script(postprocessing_cmd, infiles, outfiles): samples = [ - "/local/joosep/mlpf/cms/v3_3/nopu/SingleProtonMinusFlatPt0p7To1000_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/SingleMuFlatPt1To1000_pythia8_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/TTbar_14TeV_TuneCUETP8M1_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/SingleK0FlatPt1To1000_pythia8_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/SinglePi0Pt1To1000_pythia8_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/SingleGammaFlatPt1To1000_pythia8_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/SinglePiMinusFlatPt0p7To1000_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/SingleNeutronFlatPt0p7To1000_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/SingleElectronFlatPt1To1000_pythia8_cfi", - "/local/joosep/mlpf/cms/v3_3/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi", - "/local/joosep/mlpf/cms/v3_3/pu55to75/QCDForPF_14TeV_TuneCUETP8M1_cfi", + "/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi", ] ichunk = 1 for sample in samples: - infiles = list(glob.glob(f"{sample}/root/*.root")) + infiles = list(glob.glob(f"{sample}/root/pfntuple*.root")) for infiles_chunk in chunks(infiles, 10): - outfiles_chunk = [inf.replace(".root", ".pkl.bz2").replace("/root/", "/raw_orig/") for inf in infiles_chunk] + outfiles_chunk = [inf.replace(".root", ".pkl.bz2").replace("/root/", "/raw2/") for inf in infiles_chunk] os.makedirs(os.path.dirname(outfiles_chunk[0]), exist_ok=True) scr = write_script(infiles_chunk, outfiles_chunk) ofname = f"jobscripts/postproc_{ichunk}.sh" diff --git a/mlpf/data_cms/prepare_args.py b/mlpf/data_cms/prepare_args.py index 0b1d7a557..a07892e56 100644 --- a/mlpf/data_cms/prepare_args.py +++ b/mlpf/data_cms/prepare_args.py @@ -3,30 +3,30 @@ import os -outdir = "/local/joosep/mlpf/cms/20240702_cptruthdef" +outdir = "/local/joosep/mlpf/cms/20240823_simcluster" samples = [ - ("TTbar_14TeV_TuneCUETP8M1_cfi", 100000, 120010, "genjob_pu55to75.sh", outdir + "/pu55to75"), +# ("TTbar_14TeV_TuneCUETP8M1_cfi", 105000, 110010, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", 200000, 220010, "genjob_pu55to75.sh", outdir + "/pu55to75"), - ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 300000, 320010, "genjob_pu55to75.sh", outdir + "/pu55to75"), + ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 300000, 305000, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi", 500000, 520010, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("ZpTT_1500_14TeV_TuneCP5_cfi", 600000, 620010, "genjob_pu55to75.sh", outdir + "/pu55to75"), - ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 720010, "genjob_pu55to75.sh", outdir + "/pu55to75"), - - ("TTbar_14TeV_TuneCUETP8M1_cfi", 700000, 720010, "genjob_nopu.sh", outdir + "/nopu"), - ("MultiParticlePFGun50_cfi", 800000, 820000, "genjob_nopu.sh", outdir + "/nopu"), - ("VBF_TuneCP5_14TeV_pythia8_cfi", 900000, 920010, 
"genjob_nopu.sh", outdir + "/nopu"), - ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 1000000,1020010, "genjob_nopu.sh", outdir + "/nopu"), - - ("SingleElectronFlatPt1To1000_pythia8_cfi", 900000, 901000, "genjob_nopu.sh", outdir + "/nopu"), -# ("SingleGammaFlatPt1To1000_pythia8_cfi", 1000000,1000100, "genjob_nopu.sh", outdir + "/nopu"), -# ("SingleMuFlatPt1To1000_pythia8_cfi", 1100000,1100100, "genjob_nopu.sh", outdir + "/nopu"), -# ("SingleNeutronFlatPt0p7To1000_cfi", 1200000,1200100, "genjob_nopu.sh", outdir + "/nopu"), -# ("SinglePi0Pt1To1000_pythia8_cfi", 1300000,1300100, "genjob_nopu.sh", outdir + "/nopu"), -# ("SinglePiMinusFlatPt0p7To1000_cfi", 1400000,1400100, "genjob_nopu.sh", outdir + "/nopu"), -# ("SingleProtonMinusFlatPt0p7To1000_cfi", 1500000,1500100, "genjob_nopu.sh", outdir + "/nopu"), -# ("SingleTauFlatPt1To1000_cfi", 1600000,1610000, "genjob_nopu.sh", outdir + "/nopu"), -# ("SingleK0FlatPt1To1000_pythia8_cfi", 1700000,1700100, "genjob_nopu.sh", outdir + "/nopu"), +# ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 720010, "genjob_pu55to75.sh", outdir + "/pu55to75"), + +# ("TTbar_14TeV_TuneCUETP8M1_cfi", 702000, 705000, "genjob_nopu.sh", outdir + "/nopu"), +# ("MultiParticlePFGun50_cfi", 800000, 820000, "genjob_nopu.sh", outdir + "/nopu"), +# ("VBF_TuneCP5_14TeV_pythia8_cfi", 900000, 920010, "genjob_nopu.sh", outdir + "/nopu"), +# ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 1000000,1020010, "genjob_nopu.sh", outdir + "/nopu"), + +# ("SingleElectronFlatPt1To1000_pythia8_cfi", 900000, 900010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SingleGammaFlatPt1To1000_pythia8_cfi", 1000000,1000010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SingleMuFlatPt1To1000_pythia8_cfi", 1100000,1100010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SingleNeutronFlatPt0p7To1000_cfi", 1200000,1200010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SinglePi0Pt1To1000_pythia8_cfi", 1300000,1300010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SinglePiMinusFlatPt0p7To1000_cfi", 1400000,1400010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SingleProtonMinusFlatPt0p7To1000_cfi", 1500000,1500010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SingleTauFlatPt1To1000_cfi", 1600000,1600010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SingleK0FlatPt1To1000_pythia8_cfi", 1700000,1700010, "genjob_nopu.sh", outdir + "/nopu"), ] if __name__ == "__main__": @@ -36,6 +36,6 @@ os.makedirs(this_outdir + "/" + samp + "/root", exist_ok=True) for seed in range(seed0, seed1): - p = this_outdir + "/" + samp + "/raw/pfntuple_{}.pkl.bz2".format(seed) + p = this_outdir + "/" + samp + "/raw2/pfntuple_{}.pkl.bz2".format(seed) if not os.path.isfile(p): print(f"sbatch --mem-per-cpu 8G --partition main --time 20:00:00 --cpus-per-task 1 scripts/tallinn/cmssw-el8.sh mlpf/data_cms/{script} {samp} {seed}") diff --git a/mlpf/heptfds/cms_pf/cms_utils.py b/mlpf/heptfds/cms_pf/cms_utils.py index db0f8fe67..4f1b8cce9 100644 --- a/mlpf/heptfds/cms_pf/cms_utils.py +++ b/mlpf/heptfds/cms_pf/cms_utils.py @@ -144,8 +144,8 @@ def prepare_data_cms(fn): Xelem["sin_phi"] = np.sin(Xelem["phi"]) Xelem["cos_phi"] = np.cos(Xelem["phi"]) Xelem["typ_idx"] = np.array([ELEM_LABELS_CMS.index(int(i)) for i in Xelem["typ"]], dtype=np.float32) - ygen["typ_idx"] = np.array([CLASS_LABELS_CMS.index(abs(int(i))) for i in ygen["typ"]], dtype=np.float32) - ycand["typ_idx"] = np.array([CLASS_LABELS_CMS.index(abs(int(i))) for i in ycand["typ"]], dtype=np.float32) + ygen["typ_idx"] = np.array([CLASS_LABELS_CMS.index(abs(int(i))) for i in ygen["pid"]], dtype=np.float32) + ycand["typ_idx"] = 
np.array([CLASS_LABELS_CMS.index(abs(int(i))) for i in ycand["pid"]], dtype=np.float32)
 
     Xelem_flat = ak.to_numpy(
         np.stack(
diff --git a/mlpf/heptfds/cms_pf/ttbar.py b/mlpf/heptfds/cms_pf/ttbar.py
index 2eec180d7..077ed97fd 100644
--- a/mlpf/heptfds/cms_pf/ttbar.py
+++ b/mlpf/heptfds/cms_pf/ttbar.py
@@ -21,7 +21,7 @@
 class CmsPfTtbar(tfds.core.GeneratorBasedBuilder):
     """DatasetBuilder for cms_pf dataset."""
 
-    VERSION = tfds.core.Version("2.1.0")
+    VERSION = tfds.core.Version("2.2.0")
     RELEASE_NOTES = {
         "1.0.0": "Initial release.",
         "1.1.0": "Add muon type, fix electron GSF association",
@@ -37,6 +37,7 @@ class CmsPfTtbar(tfds.core.GeneratorBasedBuilder):
         "1.8.0": "Add ispu, genjets, genmet; disable genjet_idx; improved merging",
         "2.0.0": "New truth def based primarily on CaloParticles",
         "2.1.0": "Additional stats",
+        "2.2.0": "Split CaloParticles along tracks",
     }
     MANUAL_DOWNLOAD_INSTRUCTIONS = """
     rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_ttbar ~/tensorflow_datasets/
diff --git a/mlpf/heptfds/cms_pf/ttbar_nopu.py b/mlpf/heptfds/cms_pf/ttbar_nopu.py
index 0879ebb7f..26158b459 100644
--- a/mlpf/heptfds/cms_pf/ttbar_nopu.py
+++ b/mlpf/heptfds/cms_pf/ttbar_nopu.py
@@ -21,11 +21,12 @@
 class CmsPfTtbarNopu(tfds.core.GeneratorBasedBuilder):
     """DatasetBuilder for cms_pf_ttbar_nopu dataset."""
 
-    VERSION = tfds.core.Version("2.0.0")
+    VERSION = tfds.core.Version("2.2.0")
     RELEASE_NOTES = {
         "1.7.1": "First version",
         "1.8.0": "Add ispu, genjets, genmet; disable genjet_idx; improved merging",
         "2.0.0": "New truth def based primarily on CaloParticles",
+        "2.2.0": "Split CaloParticles along tracks",
     }
     MANUAL_DOWNLOAD_INSTRUCTIONS = """
     rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_ttbar_nopu ~/tensorflow_datasets/
diff --git a/mlpf/pyg/PFDataset.py b/mlpf/pyg/PFDataset.py
index 7ca195e43..22bcec6a4 100644
--- a/mlpf/pyg/PFDataset.py
+++ b/mlpf/pyg/PFDataset.py
@@ -35,12 +35,21 @@ def __getitem__(self, item):
             ret["ygen"] = ret["ygen"][sortidx]
 
         if self.ds.dataset_info.name.startswith("cms_"):
+            # track, target label neutral hadron -> reconstruct as charged hadron
             ret["ygen"][:, 0][(ret["X"][:, 0] == 1) & (ret["ygen"][:, 0] == 2)] = 1
+
+            # track, target label photon -> reconstruct as charged hadron
             ret["ygen"][:, 0][(ret["X"][:, 0] == 1) & (ret["ygen"][:, 0] == 5)] = 1
 
+            # ECAL cluster, target label charged hadron -> reconstruct as photon
             ret["ygen"][:, 0][(ret["X"][:, 0] == 4) & (ret["ygen"][:, 0] == 1)] = 5
+
+            # HCAL cluster, target label charged hadron -> reconstruct as neutral hadron
             ret["ygen"][:, 0][(ret["X"][:, 0] == 5) & (ret["ygen"][:, 0] == 1)] = 2
+
+            # electron target label: as photon on an ECAL cluster (rule disabled below); as neutral hadron on an HCAL cluster
             # ret["ygen"][:, 0][(ret["X"][:, 0]==4) & (ret["ygen"][:, 0] == 6)] = 5
+
             ret["ygen"][:, 0][(ret["X"][:, 0] == 5) & (ret["ygen"][:, 0] == 6)] = 2
             ret["ygen"][:, 0][(ret["X"][:, 0] == 4) & (ret["ygen"][:, 0] == 7)] = 5
             ret["ygen"][:, 0][(ret["X"][:, 0] == 5) & (ret["ygen"][:, 0] == 7)] = 2
diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py
index abcb9c5a2..0100a9a72 100644
--- a/mlpf/pyg/training.py
+++ b/mlpf/pyg/training.py
@@ -78,6 +78,7 @@ def mlpf_loss(y, ypred, batch):
     loss = {}
     loss_obj_id = FocalLoss(gamma=2.0, reduction="none")
 
+    msk_pred_particle = torch.unsqueeze((ypred["cls_id"] != 0).to(dtype=torch.float32), axis=-1)
     msk_true_particle = torch.unsqueeze((y["cls_id"] != 0).to(dtype=torch.float32), axis=-1)
     nelem = torch.sum(batch.mask)
     npart = torch.sum(y["cls_id"] != 0)
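Note on the next hunk (illustrative sketch, not part of the patch): the inputs to sliced_wasserstein_loss are standardized by the per-feature std of the true set before the distance is computed. Assuming the repo's sliced_wasserstein_loss follows the usual construction, a generic version for two same-size point clouds looks like:

    import torch

    def sliced_wasserstein(x, y, n_proj=128):
        # x, y: [n_points, n_features] point clouds with equal n_points
        theta = torch.randn(x.shape[-1], n_proj)
        theta = theta / theta.norm(dim=0, keepdim=True)  # random unit directions
        # project onto each direction and compare the sorted 1D distributions
        xp, _ = torch.sort(x @ theta, dim=0)
        yp, _ = torch.sort(y @ theta, dim=0)
        return ((xp - yp) ** 2).mean()

@@ -126,13 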
+127,23 @@ def mlpf_loss(y, ypred, batch): pred_met = torch.sum(px, axis=-2) ** 2 + torch.sum(py, axis=-2) ** 2 loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean() - loss["Sliced_Wasserstein_Loss"] = sliced_wasserstein_loss(ypred["momentum"].detach(), y["momentum"]).mean() - loss["Total"] = loss["Classification_binary"] + loss["Classification"] + loss["Regression"] + was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( + axis=-1 + ) + was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( + axis=-1 + ) + + std = was_input_true[batch.mask].std(axis=0) + loss["Sliced_Wasserstein_Loss"] = sliced_wasserstein_loss(was_input_pred / std, was_input_true / std).mean() + + loss["Total"] = loss["Classification_binary"] + loss["Classification"] + loss["Regression"] # + 0.01 * loss["Sliced_Wasserstein_Loss"] loss["Classification_binary"] = loss["Classification_binary"].detach() loss["Classification"] = loss["Classification"].detach() loss["Regression"] = loss["Regression"].detach() + loss["Sliced_Wasserstein_Loss"] = loss["Sliced_Wasserstein_Loss"].detach() return loss diff --git a/notebooks/cms/cms-dataset.ipynb b/notebooks/cms/cms-dataset.ipynb index 36706672f..62d985a2f 100644 --- a/notebooks/cms/cms-dataset.ipynb +++ b/notebooks/cms/cms-dataset.ipynb @@ -12,6 +12,7 @@ "import awkward\n", "import numpy as np\n", "import fastjet\n", + "import tqdm\n", "\n", "import matplotlib.pyplot as plt" ] @@ -44,7 +45,8 @@ "metadata": {}, "outputs": [], "source": [ - "builder = tfds.builder(\"cms_pf_ttbar\", data_dir=\"/scratch/persistent/joosep/tensorflow_datasets/\")\n", + "ds_string = \"cms_pf_ttbar:2.2.0\"\n", + "builder = tfds.builder(ds_string, data_dir=\"/scratch/persistent/joosep/tensorflow_datasets/\")\n", "ds_train = builder.as_data_source(split=\"train\")" ] }, @@ -56,32 +58,43 @@ "outputs": [], "source": [ "all_genjets = []\n", - "all_genparticles = []\n", + "all_genparticles_awk = []\n", + "all_Xs = []\n", "\n", "#loop over some events in the dataset\n", - "for iev in range(100):\n", + "for iev in tqdm.tqdm(list(range(5000))):\n", " el = ds_train[iev]\n", - " print(len(el[\"X\"]), el.keys())\n", + " # print(len(el[\"X\"]), el.keys())\n", " \n", " genjets = vector.awk(awkward.zip({\"pt\": el[\"genjets\"][:, 0], \"eta\": el[\"genjets\"][:, 1], \"phi\": el[\"genjets\"][:, 2], \"e\": el[\"genjets\"][:, 3]}))\n", - " mask_genparticles = el[\"ygen\"][:, 0]!=0\n", - " genparticles = el[\"ygen\"][mask_genparticles]\n", + " genparticles = el[\"ygen\"]\n", " \n", " gp_phi = np.arctan2(genparticles[:, 4], genparticles[:, 5]) #sphi,cphi -> phi\n", " genparticles_p4 = vector.awk(awkward.zip({\"pt\": genparticles[:, 2], \"eta\": genparticles[:, 3], \"phi\": gp_phi, \"e\": genparticles[:, 6]}))\n", " gp_ispu = genparticles[:, 7]\n", " gp_pid = np.array(CLASS_LABELS_CMS)[genparticles[:, 0].astype(np.int64)]\n", - " genparticles = awkward.Record({\n", + " genparticles_awk = awkward.Array({\n", " \"pid\": gp_pid,\n", " \"p4\": genparticles_p4,\n", " \"ispu\": genparticles[:, 7],\n", " })\n", "\n", + " all_Xs.append(el[\"X\"])\n", " all_genjets.append(genjets)\n", - " all_genparticles.append(genparticles)\n", - "\n", - "all_genjets = awkward.from_iter(all_genjets)\n", - "all_genparticles = awkward.from_iter(all_genparticles)" + " all_genparticles_awk.append(genparticles_awk)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "815302bd-3e2b-45e9-8263-de09265db7bb", + "metadata": {}, + "outputs": [], + "source": [ + "all_Xs = awkward.unflatten(awkward.from_numpy(np.concatenate(all_Xs, axis=0)), counts=[len(x) for x in all_Xs])\n", + "all_genjets = awkward.unflatten(awkward.concatenate(all_genjets), counts=[len(x) for x in all_genjets])\n", + "all_genparticles_awk = awkward.unflatten(awkward.concatenate(all_genparticles_awk), counts=[len(x) for x in all_genparticles_awk])\n", + "all_genparticles_no0 = all_genparticles_awk[all_genparticles_awk[\"pid\"]!=0]" ] }, { @@ -94,10 +107,10 @@ "p4 = vector.awk(\n", " awkward.zip(\n", " {\n", - " \"pt\": all_genparticles.p4.rho,\n", - " \"eta\": all_genparticles.p4.eta,\n", - " \"phi\": all_genparticles.p4.phi,\n", - " \"e\": all_genparticles.p4.t,\n", + " \"pt\": all_genparticles_no0.p4.rho,\n", + " \"eta\": all_genparticles_no0.p4.eta,\n", + " \"phi\": all_genparticles_no0.p4.phi,\n", + " \"e\": all_genparticles_no0.p4.t,\n", " }\n", " )\n", ")" @@ -112,10 +125,10 @@ "source": [ "jetdef = fastjet.JetDefinition(fastjet.antikt_algorithm, 0.4)\n", "cluster = fastjet.ClusterSequence(p4.to_xyzt(), jetdef)\n", - "jets = cluster.inclusive_jets(min_pt=10)\n", + "jets = cluster.inclusive_jets(min_pt=5)\n", "\n", - "cluster = fastjet.ClusterSequence(p4.to_xyzt()[all_genparticles.ispu==0], jetdef)\n", - "jets_nopu = cluster.inclusive_jets(min_pt=10)" + "cluster = fastjet.ClusterSequence(p4.to_xyzt()[all_genparticles_no0.ispu==0], jetdef)\n", + "jets_nopu = cluster.inclusive_jets(min_pt=5)" ] }, { @@ -125,12 +138,86 @@ "metadata": {}, "outputs": [], "source": [ - "b = np.linspace(10,100,100)\n", + "b = np.logspace(0,3,100)\n", "plt.hist(awkward.flatten(all_genjets.rho), bins=b, histtype=\"step\", label=\"genjets\");\n", "plt.hist(awkward.flatten(jets.pt), bins=b, histtype=\"step\", label=\"all gp jets\");\n", "plt.hist(awkward.flatten(jets_nopu.pt), bins=b, histtype=\"step\", label=\"ispu=0 gp jets\");\n", - "plt.legend()" + "plt.legend()\n", + "plt.yscale(\"log\")\n", + "plt.xscale(\"log\")\n", + "plt.xlabel(\"jet pt\")\n", + "plt.ylabel(\"number of jets\")\n", + "plt.title(ds_string)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd105f3e-61af-46f1-bb84-da196f57a391", + "metadata": {}, + "outputs": [], + "source": [ + "gen_pid = awkward.flatten(all_genparticles_awk[all_Xs[:, :, 0]==1][\"pid\"])\n", + "gen_pt = awkward.flatten(all_genparticles_awk[all_Xs[:, :, 0]==1][\"p4\"].rho)\n", + "track_pt = awkward.flatten(all_Xs[all_Xs[:, :, 0]==1][:, :, 1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ae5d52d-f1c3-4caa-816a-a38453812998", + "metadata": {}, + "outputs": [], + "source": [ + "b = np.logspace(-1,2,100)\n", + "\n", + "plt.figure(figsize=(6,6))\n", + "plt.hist2d(\n", + " awkward.to_numpy(track_pt[gen_pid!=0]),\n", + " awkward.to_numpy(gen_pt[gen_pid!=0]),\n", + " bins=b\n", + ")\n", + "plt.plot([0.1, 100], [0.1, 100], color=\"black\", ls=\"--\", lw=1.0)\n", + "plt.xscale(\"log\")\n", + "plt.yscale(\"log\")\n", + "plt.xlabel(\"track pt\")\n", + "plt.ylabel(\"gen pt\")\n", + "plt.title(ds_string)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c7d5bf0-a752-43f5-b849-b979c0118388", + "metadata": {}, + "outputs": [], + "source": [ + "bins = np.logspace(-1, 3, 20)\n", + "fracs_gen = []\n", + "\n", + "for ibin in range(len(bins)-1):\n", + " b0 = bins[ibin]\n", + " b1 = bins[ibin+1]\n", + " msk = (track_pt >= b0) & (track_pt < b1)\n", + " frac_gen = 
np.sum(gen_pid[msk]!=0) / np.sum(msk)\n", + " fracs_gen.append(frac_gen)\n", + "\n", + "plt.figure()\n", + "plt.plot(bins[:-1], fracs_gen, marker=\"o\")\n", + "plt.xscale(\"log\")\n", + "plt.xlabel(\"track pT\")\n", + "plt.ylabel(\"fraction of tracks matched to gen\")\n", + "plt.ylim(0,1)\n", + "plt.title(ds_string)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "baa4bb0c-35bf-4232-8b93-15b80ba5f20b", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -149,7 +236,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/notebooks/cms/cms-simvalidation.ipynb b/notebooks/cms/cms-simvalidation.ipynb index 7485c2851..39cb600d6 100644 --- a/notebooks/cms/cms-simvalidation.ipynb +++ b/notebooks/cms/cms-simvalidation.ipynb @@ -30,6 +30,8 @@ "import bz2\n", "import os\n", "import tqdm\n", + "import fastjet\n", + "import vector\n", "\n", "mplhep.style.use(\"CMS\")" ] @@ -62,6 +64,9 @@ "outputs": [], "source": [ "import sys\n", + "sys.path += [\"../../mlpf/\"]\n", + "\n", + "import jet_utils\n", "\n", "sys.path += [\"../../mlpf/plotting/\"]\n", "\n", @@ -71,16 +76,6 @@ "from plot_utils import pid_to_text" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "ce1e996f", - "metadata": {}, - "outputs": [], - "source": [ - "!ls -lrt /local/joosep/mlpf/cms/20240702_cptruthdef/nopu/TTbar_14TeV_TuneCUETP8M1_cfi/raw" - ] - }, { "cell_type": "code", "execution_count": null, @@ -90,7 +85,7 @@ "source": [ "sample = \"TTbar_14TeV_TuneCUETP8M1_cfi\"\n", "\n", - "maxfiles = 50\n", + "maxfiles = 100\n", "if sample.startswith(\"Single\"):\n", " maxfiles = 50\n", "\n", @@ -99,6 +94,16 @@ " os.makedirs(plot_outpath)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "020e1e9d-058e-473b-933d-2c8c965b04ff", + "metadata": {}, + "outputs": [], + "source": [ + "list(glob.glob(\"/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/{}/raw2/*.pkl.bz2\".format(sample)))[:maxfiles][0]" + ] + }, { "cell_type": "code", "execution_count": null, @@ -109,7 +114,7 @@ "pickle_data = sum(\n", " [\n", " pickle.load(bz2.BZ2File(f, \"r\"))\n", - " for f in tqdm.tqdm(list(glob.glob(\"/local/joosep/mlpf/cms/20240702_cptruthdef/nopu/{}/*/*.pkl.bz2\".format(sample)))[:maxfiles])\n", + " for f in tqdm.tqdm(list(glob.glob(\"/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/{}/raw2/*.pkl.bz2\".format(sample)))[:maxfiles])\n", " ],\n", " [],\n", ")\n", @@ -130,7 +135,7 @@ "source": [ "arrs_awk = {}\n", "arrs_flat = {}\n", - "for coll in [\"Xelem\", \"ygen\", \"ycand\"]:\n", + "for coll in [\"Xelem\"]:\n", " arrs_awk[coll] = {}\n", " arrs_flat[coll] = {}\n", " for feat in [\"typ\", \"pt\", \"eta\", \"phi\", \"e\"]:\n", @@ -139,9 +144,18 @@ " )\n", " arrs_flat[coll][feat] = awkward.from_regular([np.array(p[coll][feat].tolist()) for p in pickle_data])\n", "\n", + "for coll in [\"ygen\", \"ycand\"]:\n", + " arrs_awk[coll] = {}\n", + " arrs_flat[coll] = {}\n", + " for feat in [\"pid\", \"pt\", \"eta\", \"phi\", \"e\"]:\n", + " arrs_awk[coll][feat] = awkward.from_regular(\n", + " [np.array(p[coll][feat][p[coll][\"pid\"] != 0].tolist()) for p in pickle_data]\n", + " )\n", + " arrs_flat[coll][feat] = awkward.from_regular([np.array(p[coll][feat].tolist()) for p in pickle_data])\n", + "\n", "if \"pythia\" in pickle_data[0].keys():\n", " arrs_flat[\"pythia\"] = {}\n", - " for ifeat, feat in enumerate([\"typ\", \"pt\", \"eta\", \"phi\", \"e\"]):\n", + " for ifeat, feat in 
enumerate([\"pid\", \"pt\", \"eta\", \"phi\", \"e\"]):\n", " # arrs_awk[\"pythia\"][feat] = awkward.from_regular(\n", " # [np.array(p[\"pythia\"][:, ifeat][p[coll][:, 0]!=0].tolist()) for p in pickle_data]\n", " # )\n", @@ -152,7 +166,180 @@ " )\n", " arr.append(np.array(p[\"pythia\"][:, ifeat][mask_invis].tolist()))\n", "\n", - " arrs_flat[\"pythia\"][feat] = awkward.from_regular(arr)" + " arrs_flat[\"pythia\"][feat] = awkward.from_regular(arr)\n", + "\n", + "\n", + "genmet_cmssw = np.array([pickle_data[i][\"genmet\"][0, 0] for i in range(len(pickle_data))])\n", + "genjet_cmssw = awkward.from_iter([pickle_data[i][\"genjet\"] for i in range(len(pickle_data))])\n", + "genjet_cmssw = vector.awk(\n", + " awkward.zip(\n", + " { \n", + " \"pt\": genjet_cmssw[:, :, 0],\n", + " \"eta\": genjet_cmssw[:, :, 1],\n", + " \"phi\": genjet_cmssw[:, :, 2],\n", + " \"e\": genjet_cmssw[:, :, 3],\n", + " }\n", + " )\n", + ")\n", + "\n", + "ygen_met = np.sqrt(awkward.sum(\n", + " (arrs_awk[\"ygen\"][\"pt\"] * np.sin(arrs_awk[\"ygen\"][\"phi\"]))**2 + (arrs_awk[\"ygen\"][\"pt\"] * np.cos(arrs_awk[\"ygen\"][\"phi\"]))**2,\n", + " axis=1\n", + "))\n", + "\n", + "ycand_met = np.sqrt(awkward.sum(\n", + " (arrs_awk[\"ycand\"][\"pt\"] * np.sin(arrs_awk[\"ycand\"][\"phi\"]))**2 + (arrs_awk[\"ycand\"][\"pt\"] * np.cos(arrs_awk[\"ycand\"][\"phi\"]))**2,\n", + " axis=1\n", + "))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3808c046-7b2b-48c2-9952-f52961f70fde", + "metadata": {}, + "outputs": [], + "source": [ + "msk_trk = (arrs_flat[\"Xelem\"][\"typ\"]==1) & (arrs_flat[\"Xelem\"][\"pt\"]>10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c25a98f9-e965-4a48-a69e-78ae6493d371", + "metadata": {}, + "outputs": [], + "source": [ + "arrs_flat[\"ygen\"][\"pid\"][msk_trk]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f91ede7c-d9f0-4f27-95c6-fcffdcc73ee6", + "metadata": {}, + "outputs": [], + "source": [ + "arrs_flat[\"ycand\"][\"pid\"][msk_trk]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66276184-9ae1-49ed-803c-929ec7aeb9e2", + "metadata": {}, + "outputs": [], + "source": [ + "jets_coll = {}\n", + "jets_coll[\"cmssw\"] = genjet_cmssw\n", + "\n", + "for coll in [\"ygen\", \"ycand\"]:\n", + " vec = vector.awk(\n", + " awkward.zip(\n", + " { \n", + " \"pt\": arrs_awk[coll][\"pt\"],\n", + " \"eta\": arrs_awk[coll][\"eta\"],\n", + " \"phi\": arrs_awk[coll][\"phi\"],\n", + " \"e\": arrs_awk[coll][\"e\"],\n", + " }\n", + " )\n", + " )\n", + " jetdef = fastjet.JetDefinition(fastjet.antikt_algorithm, 0.4)\n", + " cluster = fastjet.ClusterSequence(vec.to_xyzt(), jetdef)\n", + " jets_coll[coll] = cluster.inclusive_jets(min_pt=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24de1b68-cc15-4d33-8465-db85c79d5e65", + "metadata": {}, + "outputs": [], + "source": [ + "cmssw_to_ygen = jet_utils.match_two_jet_collections(jets_coll, \"cmssw\", \"ygen\", 0.1)\n", + "cmssw_to_ycand = jet_utils.match_two_jet_collections(jets_coll, \"cmssw\", \"ycand\", 0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e89ca76e-7c4f-4db4-b309-5a167e7bf31b", + "metadata": {}, + "outputs": [], + "source": [ + "print(len(awkward.flatten(cmssw_to_ygen[\"cmssw\"])))\n", + "print(len(awkward.flatten(cmssw_to_ycand[\"cmssw\"])))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d80c62f4-0105-4e62-ab2f-2da42d0f8f15", + "metadata": {}, + "outputs": [], + "source": [ + 
"plt.figure(figsize=(10,10))\n", + "b = np.linspace(0,3,100)\n", + "plt.hist(\n", + " awkward.flatten(\n", + " jets_coll[\"ygen\"][cmssw_to_ygen[\"ygen\"]].pt / jets_coll[\"cmssw\"][cmssw_to_ygen[\"cmssw\"]].pt\n", + " ), bins=b, histtype=\"step\", lw=2\n", + ")\n", + "\n", + "plt.hist(\n", + " awkward.flatten(\n", + " jets_coll[\"ycand\"][cmssw_to_ycand[\"ycand\"]].pt / jets_coll[\"cmssw\"][cmssw_to_ycand[\"cmssw\"]].pt\n", + " ), bins=b, histtype=\"step\", lw=2\n", + ")\n", + "plt.yscale(\"log\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee6da234-1e8b-4ac5-8608-1c82c481c232", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 10))\n", + "b = np.logspace(1, 3, 101)\n", + "plt.hist(genmet_cmssw, bins=b, histtype=\"step\", lw=2)\n", + "plt.hist(ygen_met, bins=b, histtype=\"step\", lw=2)\n", + "plt.hist(ycand_met, bins=b, histtype=\"step\", lw=2)\n", + "plt.xscale(\"log\")\n", + "plt.yscale(\"log\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e916f8dd-a0a8-4aa9-a5b2-56fb7ad406c2", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 10))\n", + "b = np.logspace(1, 4, 61)\n", + "plt.hist((ygen_met/genmet_cmssw)[genmet_cmssw<1], bins=b, histtype=\"step\", lw=2)\n", + "plt.hist((ycand_met/genmet_cmssw)[genmet_cmssw<1], bins=b, histtype=\"step\", lw=2)\n", + "plt.xscale(\"log\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "953ad71f-88a3-48d2-90ee-986e2f07a7ab", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 10))\n", + "b = np.logspace(-1,2,101)\n", + "plt.hist((ygen_met/genmet_cmssw)[genmet_cmssw>1], bins=b, histtype=\"step\", lw=2)\n", + "plt.hist((ycand_met/genmet_cmssw)[genmet_cmssw>1], bins=b, histtype=\"step\", lw=2)\n", + "plt.xscale(\"log\")\n", + "plt.show()" ] }, { @@ -189,6 +376,7 @@ " cms_label(ax)\n", " plt.ylim(1, 1e5)\n", " #sample_label(ax, sample)\n", + " plt.show()\n", " plt.savefig(plot_outpath + \"all_pt.pdf\", bbox_inches=\"tight\")" ] }, @@ -214,6 +402,7 @@ " cms_label(ax)\n", " #sample_label(ax, sample)\n", " plt.ylim(1, 1e3)\n", + " plt.show()\n", " plt.savefig(plot_outpath + \"all_sume.pdf\", bbox_inches=\"tight\")" ] }, @@ -332,9 +521,9 @@ " 0\n", "]:\n", " if pid == 0:\n", - " msk = arrs_flat[\"ygen\"][\"typ\"] != pid\n", + " msk = arrs_flat[\"ygen\"][\"pid\"] != pid\n", " else:\n", - " msk = arrs_flat[\"ygen\"][\"typ\"] == pid\n", + " msk = arrs_flat[\"ygen\"][\"pid\"] == pid\n", " data1 = awkward.to_numpy(awkward.flatten(arrs_flat[\"Xelem\"][\"eta\"][msk]))\n", " data2 = awkward.to_numpy(awkward.flatten(arrs_flat[\"ygen\"][\"eta\"][msk]))\n", "\n", @@ -416,28 +605,17 @@ "source": [ "Xelem_typ_f = np.array(awkward.flatten(arrs_flat[\"Xelem\"][\"typ\"]))\n", "\n", - "ygen_typ_f = np.array(awkward.flatten(arrs_flat[\"ygen\"][\"typ\"]))\n", + "ygen_typ_f = np.array(awkward.flatten(arrs_flat[\"ygen\"][\"pid\"]))\n", "ygen_typ_id = np.zeros(len(ygen_typ_f), dtype=np.int32)\n", "for i in range(len(CLASS_LABELS_CMS)):\n", " ygen_typ_id[ygen_typ_f == CLASS_LABELS_CMS[i]] = i\n", "\n", - "ycand_typ_f = np.array(awkward.flatten(arrs_flat[\"ycand\"][\"typ\"]))\n", + "ycand_typ_f = np.array(awkward.flatten(arrs_flat[\"ycand\"][\"pid\"]))\n", "ycand_typ_id = np.zeros(len(ycand_typ_f), dtype=np.int32)\n", "for i in range(len(CLASS_LABELS_CMS)):\n", " ycand_typ_id[ycand_typ_f == CLASS_LABELS_CMS[i]] = i" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"4fd92d6f-bb46-4634-ae0c-2fe9de614a89", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "ygen_typ_f" - ] - }, { "cell_type": "code", "execution_count": null, @@ -455,7 +633,7 @@ "metadata": {}, "outputs": [], "source": [ - "np.unique(Xelem_typ_f[ygen_typ_f == 11], return_counts=True)" + "np.unique(ygen_typ_id[Xelem_typ_f == 9], return_counts=True)" ] }, { @@ -522,6 +700,7 @@ "\n", "plt.xlabel(\"$p_T$ [GeV]\")\n", "plt.ylabel(\"Number of particles\")\n", + "plt.show()\n", "plt.savefig(plot_outpath + \"pf_vs_truth_pt.pdf\", bbox_inches=\"tight\")" ] }, @@ -550,6 +729,7 @@ "\n", "plt.xlabel(\"particle $\\eta$\")\n", "plt.ylabel(\"Number of particles\")\n", + "plt.show()\n", "plt.savefig(plot_outpath + \"pf_vs_truth_eta.pdf\", bbox_inches=\"tight\")" ] }, @@ -564,11 +744,11 @@ "ax = plt.axes()\n", "b = np.logspace(-2, 4, 101)\n", "hs = []\n", - "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ygen\"][\"typ\"])).tolist())\n", + "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ygen\"][\"pid\"])).tolist())\n", "colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n", "labels = []\n", "for pid in pids[::-1]:\n", - " pt_pid = awkward.to_numpy(awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"pt\"][arrs_awk[\"ygen\"][\"typ\"] == pid])))\n", + " pt_pid = awkward.to_numpy(awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"pt\"][arrs_awk[\"ygen\"][\"pid\"] == pid])))\n", " hs.append(np.histogram(pt_pid, bins=b))\n", " labels.append(CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n", "mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n", @@ -576,7 +756,7 @@ "plt.xscale(\"log\")\n", "plt.ylim(0, 1.2 * np.sum([h[0] for h in hs], axis=0).max())\n", "if sample == \"TTbar_14TeV_TuneCUETP8M1_cfi\":\n", - " plt.ylim(0, 1.5e6)\n", + " plt.ylim(0, 1e5)\n", "\n", "plt.ticklabel_format(style=\"sci\", axis=\"y\", scilimits=(0, 0))\n", "ax.yaxis.major.formatter._useMathText = True\n", @@ -588,6 +768,7 @@ "cms_label(ax)\n", "#sample_label(ax, sample, \", MLPF truth\")\n", "plt.xlim(10**-2, 10**4)\n", + "plt.show()\n", "plt.savefig(plot_outpath + \"truth_pt.pdf\", bbox_inches=\"tight\")" ] }, @@ -602,17 +783,17 @@ "ax = plt.axes()\n", "b = np.linspace(-6, 6, 41)\n", "hs = []\n", - "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ygen\"][\"typ\"])).tolist())\n", + "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ygen\"][\"pid\"])).tolist())\n", "colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n", "labels = []\n", "for pid in pids[::-1]:\n", - " pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"eta\"][arrs_awk[\"ygen\"][\"typ\"] == pid]))\n", + " pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"eta\"][arrs_awk[\"ygen\"][\"pid\"] == pid]))\n", " hs.append(np.histogram(pt_pid, bins=b))\n", " labels.append(CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n", "mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n", "plt.ylim(0, 1.5 * np.sum([h[0] for h in hs], axis=0).max())\n", "if sample == \"TTbar_14TeV_TuneCUETP8M1_cfi\":\n", - " plt.ylim(0, 1e6)\n", + " plt.ylim(0, 1e5)\n", "plt.ticklabel_format(style=\"sci\", axis=\"y\", scilimits=(0, 0))\n", "ax.yaxis.major.formatter._useMathText = True\n", "\n", @@ -625,6 +806,7 @@ "cms_label(ax)\n", "#sample_label(ax, sample, \", MLPF truth\")\n", "plt.xlim(-6, 6)\n", + "plt.show()\n", "plt.savefig(plot_outpath + \"truth_eta.pdf\", bbox_inches=\"tight\")" ] }, @@ -639,11 +821,11 @@ "ax = plt.axes()\n", "b = np.logspace(-2, 4, 101)\n", "hs = []\n", - "pids = 
@@ -639,11 +821,11 @@
     "ax = plt.axes()\n",
     "b = np.logspace(-2, 4, 101)\n",
     "hs = []\n",
-    "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ycand\"][\"typ\"])).tolist())\n",
+    "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ycand\"][\"pid\"])).tolist())\n",
     "colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n",
     "labels = []\n",
     "for pid in pids[::-1]:\n",
-    "    pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"pt\"][arrs_awk[\"ycand\"][\"typ\"] == pid]))\n",
+    "    pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"pt\"][arrs_awk[\"ycand\"][\"pid\"] == pid]))\n",
     "    hs.append(np.histogram(pt_pid, bins=b))\n",
     "    labels.append(CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n",
     "mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n",
@@ -651,7 +833,7 @@
     "plt.xscale(\"log\")\n",
     "plt.ylim(0, 1.2 * np.sum([h[0] for h in hs], axis=0).max())\n",
     "if sample == \"TTbar_14TeV_TuneCUETP8M1_cfi\":\n",
-    "    plt.ylim(0, 1.5e6)\n",
+    "    plt.ylim(0, 1e5)\n",
     "plt.ticklabel_format(style=\"sci\", axis=\"y\", scilimits=(0, 0))\n",
     "ax.yaxis.major.formatter._useMathText = True\n",
     "\n",
@@ -662,6 +844,7 @@
     "cms_label(ax)\n",
     "#sample_label(ax, sample, \", PF\")\n",
     "plt.xlim(10**-2, 10**4)\n",
-    "plt.savefig(plot_outpath + \"pf_pt.pdf\", bbox_inches=\"tight\")"
+    "plt.savefig(plot_outpath + \"pf_pt.pdf\", bbox_inches=\"tight\")\n",
+    "plt.show()"
    ]
   },
@@ -676,11 +859,11 @@
     "ax = plt.axes()\n",
     "b = np.linspace(-6, 6, 41)\n",
     "hs = []\n",
-    "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ycand\"][\"typ\"])).tolist())\n",
+    "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ycand\"][\"pid\"])).tolist())\n",
     "colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n",
     "labels = []\n",
     "for pid in pids[::-1]:\n",
-    "    pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"eta\"][arrs_awk[\"ycand\"][\"typ\"] == pid]))\n",
+    "    pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"eta\"][arrs_awk[\"ycand\"][\"pid\"] == pid]))\n",
     "    hs.append(np.histogram(pt_pid, bins=b))\n",
     "    labels.append(CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n",
     "mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n",
@@ -688,7 +871,7 @@
     "# plt.xscale(\"log\")\n",
     "plt.ylim(0, 1.5 * np.sum([h[0] for h in hs], axis=0).max())\n",
     "if sample == \"TTbar_14TeV_TuneCUETP8M1_cfi\":\n",
-    "    plt.ylim(0, 1e6)\n",
+    "    plt.ylim(0, 1e5)\n",
     "plt.ticklabel_format(style=\"sci\", axis=\"y\", scilimits=(0, 0))\n",
     "ax.yaxis.major.formatter._useMathText = True\n",
     "\n",
@@ -699,9 +882,20 @@
     "cms_label(ax)\n",
     "#sample_label(ax, sample, \", PF\")\n",
     "plt.xlim(-6, 6)\n",
-    "plt.savefig(plot_outpath + \"pf_eta.pdf\", bbox_inches=\"tight\")"
+    "plt.savefig(plot_outpath + \"pf_eta.pdf\", bbox_inches=\"tight\")\n",
+    "plt.show()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8ce54f2f-a611-401d-b015-3baa1d7a1cdd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "arrs_flat[\"pythia\"]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -714,11 +908,11 @@
     "    ax = plt.axes()\n",
     "    b = np.logspace(-2, 5, 101)\n",
     "    hs = []\n",
-    "    pids = sorted(np.unique(awkward.flatten(arrs_flat[\"pythia\"][\"typ\"])).tolist())\n",
+    "    pids = sorted(np.unique(awkward.flatten(arrs_flat[\"pythia\"][\"pid\"])).tolist())\n",
     "    colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n",
     "    labels = []\n",
     "    for pid in pids[::-1]:\n",
-    "        pt_pid = awkward.to_numpy(awkward.flatten(arrs_flat[\"pythia\"][\"pt\"][arrs_flat[\"pythia\"][\"typ\"] == pid]))\n",
+    "        pt_pid = awkward.to_numpy(awkward.flatten(arrs_flat[\"pythia\"][\"pt\"][arrs_flat[\"pythia\"][\"pid\"] == pid]))\n",
     "        hs.append(np.histogram(pt_pid, bins=b))\n",
     "        labels.append(int(pid))\n",
     "    mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n",
@@ -730,6 +924,7 @@
     "    # plt.title(\"{}\\nMLPF truth\".format(sample))\n",
     "    cms_label(ax)\n",
     "    #sample_label(ax, sample, \", Pythia\")\n",
-    "    plt.savefig(plot_outpath + \"pythia_pt.pdf\", bbox_inches=\"tight\")"
+    "    plt.savefig(plot_outpath + \"pythia_pt.pdf\", bbox_inches=\"tight\")\n",
+    "    plt.show()"
    ]
   },
@@ -745,11 +940,11 @@
     "    ax = plt.axes()\n",
     "    b = np.linspace(-6, 6, 101)\n",
     "    hs = []\n",
-    "    pids = sorted(np.unique(awkward.flatten(arrs_flat[\"pythia\"][\"typ\"])).tolist())\n",
+    "    pids = sorted(np.unique(awkward.flatten(arrs_flat[\"pythia\"][\"pid\"])).tolist())\n",
     "    colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n",
     "    labels = []\n",
     "    for pid in pids[::-1]:\n",
-    "        pt_pid = awkward.to_numpy(awkward.flatten(arrs_flat[\"pythia\"][\"eta\"][arrs_flat[\"pythia\"][\"typ\"] == pid]))\n",
+    "        pt_pid = awkward.to_numpy(awkward.flatten(arrs_flat[\"pythia\"][\"eta\"][arrs_flat[\"pythia\"][\"pid\"] == pid]))\n",
     "        hs.append(np.histogram(pt_pid, bins=b))\n",
     "        labels.append(int(pid))\n",
     "    mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n",
@@ -761,6 +956,7 @@
     "    # plt.title(\"{}\\nMLPF truth\".format(sample))\n",
     "    cms_label(ax)\n",
     "    #sample_label(ax, sample, \", Pythia\")\n",
-    "    plt.savefig(plot_outpath + \"pythia_eta.pdf\", bbox_inches=\"tight\")"
+    "    plt.savefig(plot_outpath + \"pythia_eta.pdf\", bbox_inches=\"tight\")\n",
+    "    plt.show()"
    ]
   },
@@ -776,11 +972,11 @@
     "    plt.figure()\n",
     "    ax = plt.axes()\n",
     "    plt.hist(\n",
-    "        awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"pt\"][arrs_awk[\"ycand\"][\"typ\"] == pid])),\n",
+    "        awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"pt\"][arrs_awk[\"ycand\"][\"pid\"] == pid])),\n",
     "        bins=b, histtype=\"step\", lw=2, label=\"PF\"\n",
     "    )\n",
     "    plt.hist(\n",
-    "        awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"pt\"][arrs_awk[\"ygen\"][\"typ\"] == pid])),\n",
+    "        awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"pt\"][arrs_awk[\"ygen\"][\"pid\"] == pid])),\n",
     "        bins=b,\n",
     "        histtype=\"step\",\n",
     "        lw=2,\n",
@@ -793,6 +989,7 @@
     "    plt.xlabel(\"$p_T$ [GeV]\")\n",
     "    cms_label(ax)\n",
     "    #sample_label(ax, sample)\n",
-    "    plt.savefig(plot_outpath + \"pid{}_pt.pdf\".format(pid), bbox_inches=\"tight\")"
+    "    plt.savefig(plot_outpath + \"pid{}_pt.pdf\".format(pid), bbox_inches=\"tight\")\n",
+    "    plt.show()"
    ]
   },
@@ -808,13 +1005,13 @@
     "    plt.figure()\n",
     "    ax = plt.axes()\n",
     "    plt.hist(\n",
-    "        awkward.flatten(arrs_awk[\"ycand\"][\"eta\"][arrs_awk[\"ycand\"][\"typ\"] == pid]),\n",
-    "        weights=awkward.flatten(arrs_awk[\"ycand\"][\"e\"][arrs_awk[\"ycand\"][\"typ\"] == pid]),\n",
+    "        awkward.flatten(arrs_awk[\"ycand\"][\"eta\"][arrs_awk[\"ycand\"][\"pid\"] == pid]),\n",
+    "        weights=awkward.flatten(arrs_awk[\"ycand\"][\"e\"][arrs_awk[\"ycand\"][\"pid\"] == pid]),\n",
     "        bins=b, histtype=\"step\", lw=2, label=\"PF\"\n",
     "    )\n",
     "    plt.hist(\n",
-    "        awkward.flatten(arrs_awk[\"ygen\"][\"eta\"][arrs_awk[\"ygen\"][\"typ\"] == pid]),\n",
-    "        weights=awkward.flatten(arrs_awk[\"ygen\"][\"e\"][arrs_awk[\"ygen\"][\"typ\"] == pid]),\n",
+    "        awkward.flatten(arrs_awk[\"ygen\"][\"eta\"][arrs_awk[\"ygen\"][\"pid\"] == pid]),\n",
+    "        weights=awkward.flatten(arrs_awk[\"ygen\"][\"e\"][arrs_awk[\"ygen\"][\"pid\"] == pid]),\n",
     "        bins=b,\n",
     "        histtype=\"step\",\n",
     "        lw=2,\n",
@@ -823,21 +1020,12 @@
     "    plt.title(CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n",
     "    plt.legend(ncol=1, loc=(0.68, 0.8))\n",
     "    plt.xlabel(\"particle $\\eta$\")\n",
-    "    cms_label(ax)\n",
-    "    sample_label(ax, sample)\n",
+    "    # cms_label(ax)\n",
+    "    # sample_label(ax, sample)\n",
-    "    plt.savefig(plot_outpath + \"pid{}_eta.pdf\".format(pid), bbox_inches=\"tight\")"
+    "    plt.savefig(plot_outpath + \"pid{}_eta.pdf\".format(pid), bbox_inches=\"tight\")\n",
+    "    plt.show()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ca5207b1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "mask = arrs_flat[\"ygen\"][\"e\"] > 0  # & (np.abs(arrs_flat[\"ycand\"][\"e\"]-arrs_flat[\"ygen\"][\"e\"])<500)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
"execution_count": null, - "id": "ca5207b1", - "metadata": {}, - "outputs": [], - "source": [ - "mask = arrs_flat[\"ygen\"][\"e\"] > 0 # & (np.abs(arrs_flat[\"ycand\"][\"e\"]-arrs_flat[\"ygen\"][\"e\"])<500)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -865,6 +1053,7 @@ " plt.yscale(\"log\")\n", " plt.xlabel(\"Pythia $\\sum E$ [GeV]\")\n", " plt.ylabel(\"MLPF truth $\\sum E$ [GeV]\")\n", + " plt.show()\n", " plt.savefig(plot_outpath + \"pythia_vs_mlpf_sume.pdf\", bbox_inches=\"tight\")" ] }, @@ -895,21 +1084,74 @@ " plt.yscale(\"log\")\n", " plt.xlabel(\"Pythia $\\sum E$ [GeV]\")\n", " plt.ylabel(\"PF $\\sum E$ [GeV]\")\n", + " plt.show()\n", " plt.savefig(plot_outpath + \"pythia_vs_pf_sume.pdf\", bbox_inches=\"tight\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "19b4a45d", + "id": "c4efcc82-9e35-4a50-aaaa-f04b57f3eadc", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "gen_pid = awkward.flatten(arrs_flat[\"ygen\"][\"pid\"][arrs_flat[\"Xelem\"][\"typ\"]==1])\n", + "cand_pid = awkward.flatten(arrs_flat[\"ycand\"][\"pid\"][arrs_flat[\"Xelem\"][\"typ\"]==1])\n", + "track_pt = awkward.flatten(arrs_flat[\"Xelem\"][\"pt\"][arrs_flat[\"Xelem\"][\"typ\"]==1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b950f4a-178d-459e-b43b-7ed2c9b91b96", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure()\n", + "plt.hist(track_pt, bins=np.logspace(-2,3,100));\n", + "plt.xscale(\"log\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f5ee921-04c7-4691-b5ce-ee217b93d567", + "metadata": {}, + "outputs": [], + "source": [ + "bins = np.logspace(-1, 3, 20)\n", + "fracs_gen = []\n", + "fracs_cand = []\n", + "\n", + "for ibin in range(len(bins)-1):\n", + " b0 = bins[ibin]\n", + " b1 = bins[ibin+1]\n", + " msk = (track_pt >= b0) & (track_pt < b1)\n", + " frac_gen = np.sum(gen_pid[msk]!=0) / np.sum(msk)\n", + " frac_cand = np.sum(cand_pid[msk]!=0) / np.sum(msk)\n", + " fracs_gen.append(frac_gen)\n", + " fracs_cand.append(frac_cand)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdec0d58-32b5-4de0-b759-7d842b330bcd", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure()\n", + "plt.plot(bins[:-1], fracs_gen, marker=\"o\")\n", + "plt.plot(bins[:-1], fracs_cand, marker=\"o\")\n", + "plt.xscale(\"log\")\n", + "plt.show()" + ] }, { "cell_type": "code", "execution_count": null, - "id": "689c787c-a5ab-4ddc-bd82-8e64392dbe25", + "id": "86838aaa-04ae-4e1e-9ba0-ab28164ea947", "metadata": {}, "outputs": [], "source": [] diff --git a/parameters/pytorch/pyg-clic.yaml b/parameters/pytorch/pyg-clic.yaml index f8bb73d1b..dd16021c4 100644 --- a/parameters/pytorch/pyg-clic.yaml +++ b/parameters/pytorch/pyg-clic.yaml @@ -10,7 +10,7 @@ num_epochs: 100 patience: 20 lr: 0.0001 lr_schedule: cosinedecay # constant, cosinedecay, onecycle -conv_type: gnn_lsh # gnn_lsh, attention, mamba, flashattention +conv_type: attention # gnn_lsh, attention, mamba, flashattention ntrain: ntest: nvalid: @@ -35,12 +35,12 @@ model: gnn_lsh: conv_type: gnn_lsh - embedding_dim: 256 - width: 256 - num_convs: 6 + embedding_dim: 512 + width: 512 + num_convs: 8 activation: "elu" # gnn-lsh specific parameters - bin_size: 256 + bin_size: 32 max_num_bins: 200 distance_dim: 128 layernorm: True @@ -50,16 +50,16 @@ model: attention: conv_type: attention - num_convs: 6 - dropout_ff: 0.0 + num_convs: 12 + dropout_ff: 0.1 dropout_conv_id_mha: 0.0 dropout_conv_id_ff: 0.0 dropout_conv_reg_mha: 0.0 
diff --git a/parameters/pytorch/pyg-clic.yaml b/parameters/pytorch/pyg-clic.yaml
index f8bb73d1b..dd16021c4 100644
--- a/parameters/pytorch/pyg-clic.yaml
+++ b/parameters/pytorch/pyg-clic.yaml
@@ -10,7 +10,7 @@ num_epochs: 100
 patience: 20
 lr: 0.0001
 lr_schedule: cosinedecay # constant, cosinedecay, onecycle
-conv_type: gnn_lsh # gnn_lsh, attention, mamba, flashattention
+conv_type: attention # gnn_lsh, attention, mamba, flashattention
 ntrain:
 ntest:
 nvalid:
@@ -35,12 +35,12 @@ model:
   gnn_lsh:
     conv_type: gnn_lsh
-    embedding_dim: 256
-    width: 256
-    num_convs: 6
+    embedding_dim: 512
+    width: 512
+    num_convs: 8
     activation: "elu"
     # gnn-lsh specific parameters
-    bin_size: 256
+    bin_size: 32
     max_num_bins: 200
     distance_dim: 128
     layernorm: True
@@ -50,16 +50,16 @@ model:
   attention:
     conv_type: attention
-    num_convs: 6
-    dropout_ff: 0.0
+    num_convs: 12
+    dropout_ff: 0.1
     dropout_conv_id_mha: 0.0
     dropout_conv_id_ff: 0.0
     dropout_conv_reg_mha: 0.0
     dropout_conv_reg_ff: 0.0
     activation: "relu"
-    head_dim: 16
+    head_dim: 32
     num_heads: 32
-    attention_type: flash
+    attention_type: math
     use_pre_layernorm: True

   mamba:
@@ -106,9 +106,9 @@ train_dataset:
       batch_size: 1
       samples:
         clic_edm_ttbar_pf:
-          version: 2.0.0
+          version: 2.1.0
         clic_edm_qq_pf:
-          version: 2.0.0
+          version: 2.1.0

 valid_dataset:
   clic:
@@ -116,12 +116,12 @@ valid_dataset:
       batch_size: 1
       samples:
         clic_edm_ttbar_pf:
-          version: 2.0.0
+          version: 2.1.0
         clic_edm_qq_pf:
-          version: 2.0.0
+          version: 2.1.0

 test_dataset:
   clic_edm_ttbar_pf:
-    version: 2.0.0
+    version: 2.1.0
   clic_edm_qq_pf:
-    version: 2.0.0
+    version: 2.1.0
diff --git a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml
index 9504ce378..cfacab525 100644
--- a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml
+++ b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml
@@ -108,7 +108,7 @@ train_dataset:
       batch_size: 10
       samples:
         cms_pf_ttbar_nopu:
-          version: 2.0.0
+          version: 2.2.0

 valid_dataset:
   cms:
@@ -116,8 +116,8 @@ valid_dataset:
       batch_size: 10
       samples:
         cms_pf_ttbar_nopu:
-          version: 2.0.0
+          version: 2.2.0

 test_dataset:
   cms_pf_ttbar_nopu:
-    version: 2.0.0
+    version: 2.2.0
diff --git a/parameters/pytorch/pyg-cms.yaml b/parameters/pytorch/pyg-cms.yaml
index 183a5f52a..68e160693 100644
--- a/parameters/pytorch/pyg-cms.yaml
+++ b/parameters/pytorch/pyg-cms.yaml
@@ -8,7 +8,7 @@ gpu_batch_multiplier: 1
 load:
 num_epochs: 100
 patience: 20
-lr: 0.0002
+lr: 0.0001
 lr_schedule: cosinedecay # constant, cosinedecay, onecycle
 conv_type: attention
 ntrain:
@@ -107,22 +107,16 @@ train_dataset:
     physical_pu:
       batch_size: 1
       samples:
-        cms_pf_qcd:
-          version: 2.0.0
         cms_pf_ttbar:
-          version: 2.0.0
+          version: 2.2.0

 valid_dataset:
   cms:
     physical_pu:
       batch_size: 1
       samples:
-        cms_pf_qcd:
-          version: 2.0.0
         cms_pf_ttbar:
-          version: 2.0.0
+          version: 2.2.0

 test_dataset:
   cms_pf_ttbar:
-    version: 2.0.0
-  cms_pf_qcd:
-    version: 2.0.0
+    version: 2.2.0
diff --git a/scripts/generate_tfds.sh b/scripts/generate_tfds.sh
index 2aab3dda6..d8533b658 100755
--- a/scripts/generate_tfds.sh
+++ b/scripts/generate_tfds.sh
@@ -4,7 +4,7 @@ export KERAS_BACKEND=tensorflow
 export PYTHONPATH="mlpf:$PYTHONPATH"

 # T2_EE_Estonia
-export IMG=/home/software/singularity/pytorch.simg:2024-07-08
+export IMG=/home/software/singularity/pytorch.simg:2024-08-18
 export CMD="singularity exec -B /local -B /scratch/persistent $IMG tfds build "

 # Desktop
@@ -14,9 +14,9 @@ export CMD="singularity exec -B /local -B /scratch/persistent $IMG tfds build "
 # export CMD="singularity exec -B /media/joosep/data --env PYTHONPATH=$PYTHONPATH $IMG tfds build "

 # CMS
-# export DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets
-# export MANUAL_DIR=/local/joosep/mlpf/cms/20240702_cptruthdef
-# $CMD mlpf/heptfds/cms_pf/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_ttbar.log &
+export DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets
+export MANUAL_DIR=/local/joosep/mlpf/cms/20240823_simcluster
+$CMD mlpf/heptfds/cms_pf/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_ttbar.log &
 # $CMD mlpf/heptfds/cms_pf/qcd --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_qcd.log &
 # $CMD mlpf/heptfds/cms_pf/ztt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_ztt.log &
 # $CMD mlpf/heptfds/cms_pf/qcd_high_pt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_qcd_high_pt.log &
@@ -34,7 +34,7 @@ export CMD="singularity exec -B /local -B
/scratch/persistent $IMG tfds build " # $CMD mlpf/heptfds/cms_pf/ttbar_nopu --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_ttbar_nopu.log & # $CMD mlpf/heptfds/cms_pf/qcd_nopu --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_qcd_nopu.log & # $CMD mlpf/heptfds/cms_pf/vbf_nopu --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_vbf_nopu.log & -# wait +wait # CLIC cluster-based # export DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets diff --git a/scripts/local_test_pyg.sh b/scripts/local_test_pyg.sh index 00fd3cbd0..6bbc0042b 100755 --- a/scripts/local_test_pyg.sh +++ b/scripts/local_test_pyg.sh @@ -9,8 +9,8 @@ mkdir -p local_test_data/TTbar_14TeV_TuneCUETP8M1_cfi/root cd local_test_data/TTbar_14TeV_TuneCUETP8M1_cfi/root #Only CMS-internal use is permitted by CMS rules! Do not use these MC simulation files otherwise! -wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/20240702_cptruthdef/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_100000.root -wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/20240702_cptruthdef/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_100001.root +wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/20240823_simcluster/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_100000.root +wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/20240823_simcluster/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_100001.root cd ../../.. diff --git a/scripts/tallinn/a100/pytorch-clic.sh b/scripts/tallinn/a100/pytorch-clic.sh new file mode 100755 index 000000000..3d6e59a4f --- /dev/null +++ b/scripts/tallinn/a100/pytorch-clic.sh @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --partition gpu +#SBATCH --gres gpu:a100:1 +#SBATCH --mem-per-gpu 50G +#SBATCH -o logs/slurm-%x-%j-%N.out + +IMG=/home/software/singularity/pytorch.simg:2024-08-18 +cd ~/particleflow + +ulimit -n 10000 +singularity exec -B /scratch/persistent --nv \ + --env PYTHONPATH=hep_tfds \ + --env KERAS_BACKEND=torch \ + $IMG python3 mlpf/pyg_pipeline.py --dataset clic --gpus 1 \ + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \ + --train --conv-type attention --num-epochs 100 --gpu-batch-multiplier 256 --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --comet --dtype bfloat16 diff --git a/scripts/tallinn/a100/pytorch-finetune.sh b/scripts/tallinn/a100/pytorch-finetune.sh new file mode 100644 index 000000000..9e39580c8 --- /dev/null +++ b/scripts/tallinn/a100/pytorch-finetune.sh @@ -0,0 +1,17 @@ +#!/bin/bash +#SBATCH --partition gpu +#SBATCH --gres gpu:mig:1 +#SBATCH --mem-per-gpu 60G +#SBATCH -o logs/slurm-%x-%j-%N.out + +IMG=/home/software/singularity/pytorch.simg:2024-07-08 +cd ~/particleflow + +env + +singularity exec -B /scratch/persistent --nv \ + --env PYTHONPATH=hep_tfds \ + --env KERAS_BACKEND=torch \ + $IMG python3.10 mlpf/pyg_pipeline.py --dataset cms --gpus 1 \ + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms-finetune.yaml \ + --train --test --make-plots --conv-type attention --attention-type flash --gpu-batch-multiplier 4 --num-workers 1 --prefetch-factor 50 --dtype bfloat16 --checkpoint-freq 1 --load experiments/pyg-cms_20240804_095032_809397_finetune/checkpoints/checkpoint-13-19.827532.pth --num-epochs 50 diff --git a/scripts/tallinn/a100/pytorch-small-eval-clic.sh 
b/scripts/tallinn/a100/pytorch-small-eval-clic.sh old mode 100644 new mode 100755 index a10df0bda..a8a79ad82 --- a/scripts/tallinn/a100/pytorch-small-eval-clic.sh +++ b/scripts/tallinn/a100/pytorch-small-eval-clic.sh @@ -1,16 +1,16 @@ #!/bin/bash #SBATCH --partition gpu #SBATCH --gres gpu:mig:1 -#SBATCH --mem-per-gpu 60G +#SBATCH --mem-per-gpu 50G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/pytorch.simg:2024-08-02 +IMG=/home/software/singularity/pytorch.simg:2024-08-18 cd ~/particleflow -WEIGHTS=experiments/pyg-clic_20240807_134034_168101/checkpoints/checkpoint-100-9.413720.pth +WEIGHTS=experiments/pyg-clic_20240830_114129_279460/checkpoints/checkpoint-11-8.095037.pth singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env KERAS_BACKEND=torch \ $IMG python3 mlpf/pyg_pipeline.py --dataset clic --gpus 1 \ --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \ - --test --make-plots --gpu-batch-multiplier 200 --load $WEIGHTS --dtype bfloat16 + --test --make-plots --gpu-batch-multiplier 100 --load $WEIGHTS --dtype bfloat16 --ntest 10000 diff --git a/scripts/tallinn/a100/pytorch-small-eval-cms.sh b/scripts/tallinn/a100/pytorch-small-eval-cms.sh old mode 100644 new mode 100755 index b85fa642f..b91a923d3 --- a/scripts/tallinn/a100/pytorch-small-eval-cms.sh +++ b/scripts/tallinn/a100/pytorch-small-eval-cms.sh @@ -4,13 +4,13 @@ #SBATCH --mem-per-gpu 50G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/pytorch.simg:2024-08-02 +IMG=/home/software/singularity/pytorch.simg:2024-08-18 cd ~/particleflow -WEIGHTS=experiments/pyg-cms-ttbar-nopu_20240815_233931_332621/checkpoints/checkpoint-17-17.282402.pth +WEIGHTS=experiments/pyg-cms_20240901_194940_868487/checkpoints/checkpoint-02-18.710537.pth singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env KERAS_BACKEND=torch \ $IMG python mlpf/pyg_pipeline.py --dataset cms --gpus 1 \ - --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms-ttbar-nopu.yaml \ - --test --make-plots --gpu-batch-multiplier 5 --load $WEIGHTS --ntest 10000 --dtype bfloat16 + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ + --test --make-plots --gpu-batch-multiplier 10 --load $WEIGHTS --ntest 5000 --dtype bfloat16 diff --git a/scripts/tallinn/a100/pytorch.sh b/scripts/tallinn/a100/pytorch.sh index db91084de..8a0535da2 100755 --- a/scripts/tallinn/a100/pytorch.sh +++ b/scripts/tallinn/a100/pytorch.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --partition gpu #SBATCH --gres gpu:a100:1 -#SBATCH --mem-per-gpu 200G +#SBATCH --mem-per-gpu 100G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/pytorch.simg:2024-08-02 +IMG=/home/software/singularity/pytorch.simg:2024-08-18 cd ~/particleflow ulimit -n 10000 @@ -12,5 +12,6 @@ singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env KERAS_BACKEND=torch \ $IMG python3 mlpf/pyg_pipeline.py --dataset cms --gpus 1 \ - --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms-ttbar-nopu.yaml \ - --train --test --make-plots --num-epochs 20 --conv-type attention --gpu-batch-multiplier 10 --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --comet --ntrain 50000 --nvalid 5000 --ntest 5000 + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ + --train --test --make-plots --num-epochs 100 --conv-type 
attention \ + --gpu-batch-multiplier 10 --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --comet diff --git a/scripts/tallinn/cmssw-el8.sh b/scripts/tallinn/cmssw-el8.sh index f2d0f1766..c77d6ac75 100755 --- a/scripts/tallinn/cmssw-el8.sh +++ b/scripts/tallinn/cmssw-el8.sh @@ -6,4 +6,4 @@ source /cvmfs/cms.cern.ch/cmsset_default.sh export UNPACKED_IMAGE=/cvmfs/singularity.opensciencegrid.org/cmssw/cms\:rhel8-x86_64/ -cmssw-el8 -B /cms -B /local -B /scratch/persistent -B /scratch/local --command-to-run $@ +cmssw-el8 -B /root -B /cms -B /local -B /scratch/persistent -B /scratch/local --command-to-run $@ diff --git a/scripts/tallinn/rtx/clic.sh b/scripts/tallinn/rtx/clic.sh index 024936422..9bb08dc26 100755 --- a/scripts/tallinn/rtx/clic.sh +++ b/scripts/tallinn/rtx/clic.sh @@ -1,16 +1,16 @@ #!/bin/bash #SBATCH --partition gpu -#SBATCH --gres gpu:rtx:4 +#SBATCH --gres gpu:rtx:8 #SBATCH --mem-per-gpu 40G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/tf-2.14.0.simg +IMG=/home/software/singularity/pytorch.simg:2024-08-02 cd ~/particleflow -#TF training +ulimit -n 10000 singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ - --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python3.10 mlpf/pipeline.py train -c parameters/tensorflow/clic.yaml \ - --plot-freq 1 \ - --batch-multiplier 0.5 + --env KERAS_BACKEND=torch \ + $IMG python3 mlpf/pyg_pipeline.py --dataset clic --gpus 8 \ + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \ + --train --conv-type attention --attention-type math --num-epochs 200 --gpu-batch-multiplier 32 --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --comet diff --git a/scripts/tallinn/rtx/pytorch.sh b/scripts/tallinn/rtx/pytorch.sh index b029fa7c4..0ce0280f5 100755 --- a/scripts/tallinn/rtx/pytorch.sh +++ b/scripts/tallinn/rtx/pytorch.sh @@ -4,11 +4,12 @@ #SBATCH --mem-per-gpu 40G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/pytorch.simg:2024-07-03 +IMG=/home/software/singularity/pytorch.simg:2024-08-18 +ulimit -n 10000 singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env KERAS_BACKEND=torch \ - $IMG python3.10 mlpf/pyg_pipeline.py --dataset clic --gpus 4 \ + $IMG python3 mlpf/pyg_pipeline.py --dataset clic --gpus 4 \ --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \ - --train --test --make-plots --conv-type attention --gpu-batch-multiplier 10 --num-workers 1 --prefetch-factor 10 --attention-type math --dtype float32 + --train --test --make-plots --conv-type attention --gpu-batch-multiplier 10 --num-workers 2 --prefetch-factor 100 --attention-type math --dtype float32
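A note on the attention settings changed in parameters/pytorch/pyg-clic.yaml above: head_dim: 32 with num_heads: 32 implies 32 * 32 = 1024 features per element, and attention_type: math corresponds to PyTorch's plain (non-flash) scaled-dot-product kernel, consistent with the --attention-type math flag used in the RTX scripts. Below is a minimal standalone sketch of these shapes; it assumes a recent PyTorch that provides torch.nn.attention.sdpa_kernel and is not the repository's model code.

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

head_dim, num_heads = 32, 32      # as in pyg-clic.yaml
batch, n_elem = 2, 256            # dummy event with 256 input elements
q = k = v = torch.randn(batch, num_heads, n_elem, head_dim)

# Select the "math" (non-flash) scaled-dot-product backend.
with sdpa_kernel(SDPBackend.MATH):
    out = F.scaled_dot_product_attention(q, k, v)

# Concatenating the heads gives num_heads * head_dim = 1024 features per element.
out = out.transpose(1, 2).reshape(batch, n_elem, num_heads * head_dim)
print(out.shape)  # torch.Size([2, 256, 1024])
```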