diff --git a/mlpf/data_cms/genjob_nopu.sh b/mlpf/data_cms/genjob_nopu.sh index a60c4bda3..c19077759 100755 --- a/mlpf/data_cms/genjob_nopu.sh +++ b/mlpf/data_cms/genjob_nopu.sh @@ -6,7 +6,7 @@ set -e set -x -OUTDIR=/local/joosep/mlpf/cms/20240702_cptruthdef/nopu/ +OUTDIR=/local/joosep/mlpf/cms/20240823_simcluster/nopu/ CMSSWDIR=/scratch/persistent/joosep/CMSSW_14_1_0_pre3 MLPF_PATH=/home/joosep/particleflow/ @@ -22,7 +22,7 @@ mkdir -p $OUTDIR PILEUP=NoPileUp PILEUP_INPUT= -N=200 +N=100 env source /cvmfs/cms.cern.ch/cmsset_default.sh @@ -73,8 +73,8 @@ ls -lrt echo "process.RandomNumberGeneratorService.generator.initialSeed = $SEED" >> step2_phase1_new.py cmsRun step2_phase1_new.py > /dev/null cmsRun step3_phase1_new.py > /dev/null -#cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py mv pfntuple.root pfntuple_${SEED}.root + python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ bzip2 -z pfntuple_${SEED}.pkl cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/ @@ -82,6 +82,6 @@ cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/ #copy ROOT outputs #cp step2_phase1_new.root $OUTDIR/$SAMPLE/root/step2_${SEED}.root #cp step3_phase1_new.root $OUTDIR/$SAMPLE/root/step3_${SEED}.root -#cp pfntuple_${SEED}.root $OUTDIR/$SAMPLE/root/ +cp pfntuple_${SEED}.root $OUTDIR/$SAMPLE/root/ rm -Rf /scratch/local/joosep/$SLURM_JOBID diff --git a/mlpf/data_cms/genjob_pu55to75.sh b/mlpf/data_cms/genjob_pu55to75.sh index cd5763e41..54ed7c166 100755 --- a/mlpf/data_cms/genjob_pu55to75.sh +++ b/mlpf/data_cms/genjob_pu55to75.sh @@ -7,7 +7,7 @@ set -e set -x -OUTDIR=/local/joosep/mlpf/cms/20240702_cptruthdef/pu55to75/ +OUTDIR=/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/ CMSSWDIR=/scratch/persistent/joosep/CMSSW_14_1_0_pre3 MLPF_PATH=/home/joosep/particleflow/ @@ -77,13 +77,13 @@ cmsRun step2_phase1_new.py > /dev/null cmsRun step3_phase1_new.py > /dev/null #cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py mv pfntuple.root pfntuple_${SEED}.root -python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ -bzip2 -z pfntuple_${SEED}.pkl -cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/ +# python3 ${MLPF_PATH}/mlpf/data_cms/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ +# bzip2 -z pfntuple_${SEED}.pkl +# cp *.pkl.bz2 $OUTDIR/$SAMPLE/raw/ #copy ROOT outputs #cp step2_phase1_new.root $OUTDIR/$SAMPLE/root/step2_${SEED}.root #cp step3_phase1_new.root $OUTDIR/$SAMPLE/root/step3_${SEED}.root -#cp pfntuple_${SEED}.root $OUTDIR/$SAMPLE/root/ +cp pfntuple_${SEED}.root $OUTDIR/$SAMPLE/root/ rm -Rf /scratch/local/joosep/$SLURM_JOBID diff --git a/mlpf/data_cms/postprocessing2.py b/mlpf/data_cms/postprocessing2.py index 1a36e4fac..6d1c9e3e6 100644 --- a/mlpf/data_cms/postprocessing2.py +++ b/mlpf/data_cms/postprocessing2.py @@ -2,16 +2,13 @@ import os import pickle -import matplotlib -import matplotlib.pyplot as plt import networkx as nx import numpy as np import tqdm import uproot import vector import awkward - -matplotlib.use("Agg") +from sklearn.neighbors import KDTree elem_branches = [ "typ", @@ -70,7 +67,7 @@ "phierror4", ] -target_branches = ["typ", "charge", "pt", "eta", "sin_phi", "cos_phi", "e", "ispu", "orig_pid"] +target_branches = ["pid", "charge", "pt", "eta", "sin_phi", "cos_phi", "e", "ispu"] def print_gen(g, min_pt=1): @@ -87,9 +84,19 @@ def print_gen(g, min_pt=1): if g.nodes[node]["pt"] > min_pt: print(node, g.nodes[node]["pt"], g.nodes[node]["eta"], g.nodes[node]["phi"], g.nodes[node]["typ"]) - gen_nodes = [n for n in 
g.nodes if n[0] == "cp" and g.nodes[n]["pt"] > min_pt] - for node in gen_nodes: - children = [(g.nodes[suc]["typ"], g.edges[node, suc]["weight"]) for suc in g.successors(node)] + # sc_nodes = [n for n in g.nodes if n[0] == "sc"] + # for node in sc_nodes: + # print(node, g.nodes[node]["pt"], g.nodes[node]["eta"], g.nodes[node]["phi"], g.nodes[node]["pid"]) + # children = list(g.successors(node)) + # int_e = 0.0 + # for ch in children: + # print(" ", ch, g.edges[node, ch]["weight"]) + # int_e += g.edges[node, ch]["weight"] + # g.nodes[node]["interacting"] = int_e + + cp_nodes = [n for n in g.nodes if n[0] == "cp"] + for node in cp_nodes: + children = list(g.successors(node)) print(node, g.nodes[node]["pt"], g.nodes[node]["eta"], g.nodes[node]["phi"], g.nodes[node]["pid"], children) @@ -133,63 +140,6 @@ def get_charge(pid): raise Exception("Unknown pid: ", pid) -def draw_event(g): - pos = {} - for node in g.nodes: - pos[node] = (g.nodes[node]["eta"], g.nodes[node]["phi"]) - - fig = plt.figure(figsize=(10, 10)) - - nodes_to_draw = [n for n in g.nodes if n[0] == "elem"] - nx.draw_networkx( - g, - pos=pos, - with_labels=False, - node_size=5, - nodelist=nodes_to_draw, - edgelist=[], - node_color="red", - node_shape="s", - alpha=0.5, - ) - - nodes_to_draw = [n for n in g.nodes if n[0] == "pfcand"] - nx.draw_networkx( - g, - pos=pos, - with_labels=False, - node_size=10, - nodelist=nodes_to_draw, - edgelist=[], - node_color="green", - node_shape="x", - alpha=0.5, - ) - - nodes_to_draw = [n for n in g.nodes if (n[0] == "cp")] - nx.draw_networkx( - g, - pos=pos, - with_labels=False, - node_size=1, - nodelist=nodes_to_draw, - edgelist=[], - node_color="blue", - node_shape=".", - alpha=0.5, - ) - - # draw edges between genparticles and elements - edges_to_draw = [e for e in g.edges if e[0] in nodes_to_draw] - nx.draw_networkx_edges(g, pos, edgelist=edges_to_draw, arrows=False, alpha=0.1) - - plt.xlim(-6, 6) - plt.ylim(-4, 4) - plt.tight_layout() - plt.axis("on") - return fig - - def compute_gen_met(g): genpart = [elem for elem in g.nodes if elem[0] == "cp"] px = np.sum([g.nodes[elem]["pt"] * np.cos(g.nodes[elem]["phi"]) for elem in genpart]) @@ -198,178 +148,65 @@ def compute_gen_met(g): return met -def merge_closeby_particles(g, deltar_cut=0.01, max_iter=100): - print("merging closeby met={:.2f}".format(compute_gen_met(g))) - - for it in range(max_iter): - particles_to_merge = [elem for elem in g.nodes if elem[0] == "cp"] - part_eta = [g.nodes[node]["eta"] for node in particles_to_merge] - part_phi = [g.nodes[node]["phi"] for node in particles_to_merge] - - # find pairs that are close by in deltaR - # note that if there are >2 particles close by to each other, only the closest 2 get merged - merge_pairs = [] - pairs_0, pairs_1 = deltar_pairs(part_eta, part_phi, deltar_cut) - - # no closeby particles, break - if len(pairs_0) == 0: - break - merge_pairs = [(particles_to_merge[p0], particles_to_merge[p1]) for p0, p1 in zip(pairs_0, pairs_1)] - - print("merging {} pairs".format(len(merge_pairs))) - for pair in merge_pairs: - if pair[0] in g.nodes and pair[1] in g.nodes: - lv = vector.obj(pt=0, eta=0, phi=0, E=0) - sum_pu = 0.0 - sum_tot = 0.0 - for gp in pair: - lv += vector.obj( - pt=g.nodes[gp]["pt"], - eta=g.nodes[gp]["eta"], - phi=g.nodes[gp]["phi"], - E=g.nodes[gp]["e"], - ) - sum_pu += g.nodes[gp]["ispu"] * g.nodes[gp]["e"] - sum_tot += g.nodes[gp]["e"] - - # now update the remaining particle properties - g.nodes[pair[0]]["pt"] = lv.pt - g.nodes[pair[0]]["eta"] = lv.eta - g.nodes[pair[0]]["phi"] = 
lv.phi - g.nodes[pair[0]]["e"] = lv.energy - g.nodes[pair[0]]["ispu"] = sum_pu / sum_tot - orig_pid = g.nodes[pair[0]]["pid"] - if g.nodes[pair[1]]["e"] > g.nodes[pair[0]]["e"]: - orig_pid = g.nodes[pair[1]]["pid"] - g.nodes[pair[0]]["pid"] = orig_pid - - # add edge weights from the deleted particle to the remaining particle - for suc in g.successors(pair[1]): - if (pair[0], suc) in g.edges: - g.edges[(pair[0], suc)]["weight"] += g.edges[(pair[1], suc)]["weight"] - g.remove_nodes_from([pair[1]]) - print("done merging, met={:.2f}".format(compute_gen_met(g))) - - -def cleanup_graph(g, node_energy_threshold=0.1, edge_energy_threshold=0.05): - g = g.copy() - - print("start cleanup, met={:.2f}".format(compute_gen_met(g))) - - # For each truth particle, compute the energy in tracks or calorimeter clusters - for node in g.nodes: - - # CaloParticles or TrackingParticles - if node[0] == "cp": - E_track = 0.0 - E_calo = 0.0 - E_other = 0.0 - E_hf = 0.0 - E_hfem = 0.0 - E_hfhad = 0.0 - - # remap PID to PF-like - g.nodes[node]["remap_pid"] = map_pdgid_to_candid(abs(g.nodes[node]["pid"]), g.nodes[node]["charge"]) - - for suc in g.successors(node): - elem_type = g.nodes[suc]["typ"] - if elem_type in [1, 6]: - E_track += g.edges[node, suc]["weight"] - elif elem_type in [4, 5, 10, 11]: - E_calo += g.edges[node, suc]["weight"] - elif elem_type in [8, 9]: - if elem_type == 8: - E_hfem += g.edges[node, suc]["weight"] - elif elem_type == 9: - E_hfhad += g.edges[node, suc]["weight"] - E_hf += g.edges[node, suc]["weight"] - else: - E_other += g.edges[node, suc]["weight"] - - g.nodes[node]["E_track"] = E_track - g.nodes[node]["E_calo"] = E_calo - g.nodes[node]["E_other"] = E_other - g.nodes[node]["E_hf"] = E_hf - g.nodes[node]["E_hfem"] = E_hfem - g.nodes[node]["E_hfhad"] = E_hfhad - - # If there are multiple tracks matched to a gen/sim particle, keep the association to the closest one by dR - for node in g.nodes: - if node[0] == "cp": - # collect tracks or GSFs - tracks = [] - for suc in g.successors(node): - typ = g.nodes[suc]["typ"] - if typ == 1 or typ == 6: - tracks.append(suc) - if len(tracks) > 1: - n0 = g.nodes[node] - drs = [] - for tr in tracks: - n1 = g.nodes[tr] - deta = np.abs(n0["eta"] - n1["eta"]) - dphi = np.mod(n0["phi"] - n1["phi"] + np.pi, 2 * np.pi) - np.pi - dr2 = deta**2 + dphi**2 - drs.append(dr2) - imin = np.argmin(drs) - - # set the weight of the edge to the other tracks to 0 - for itr in range(len(tracks)): - if itr != imin: - g.edges[(node, tracks[itr])]["weight"] = 0.0 +def split_caloparticles(g, elem_type): - for node in g.nodes: - if node[0] == "cp": - remap_pid = g.nodes[node]["remap_pid"] - - # charged particles that leave no track should not be reconstructed as charged - if remap_pid in [211, 13] and g.nodes[node]["E_track"] == 0: - g.nodes[node]["remap_pid"] = 130 - g.nodes[node]["charge"] = 0 - if remap_pid in [11] and g.nodes[node]["E_track"] == 0: - g.nodes[node]["remap_pid"] = 22 - g.nodes[node]["charge"] = 0 - - # if a particle only leaves deposits in the HF, it should be reconstructed as an HF candidate - if (g.nodes[node]["E_track"] == 0) and (g.nodes[node]["E_calo"] == 0) and (g.nodes[node]["E_other"] == 0) and g.nodes[node]["E_hf"] > 0: - if g.nodes[node]["E_hfhad"] > g.nodes[node]["E_hfem"]: - g.nodes[node]["remap_pid"] = 1 - g.nodes[node]["charge"] = 0 - else: - g.nodes[node]["remap_pid"] = 2 - g.nodes[node]["charge"] = 0 - - # CaloParticles contain a lot of electrons and muons with a soft pt spectrum - # these should not be attempted to be reconstructed as ele/mu, 
but rather as charged or neutral hadrons - for node in g.nodes: - if node[0] == "cp": - nd = g.nodes[node] - if nd["pt"] < 1.0 and (abs(nd["remap_pid"]) == 11 or abs(nd["remap_pid"]) == 13): - if g.nodes[node]["E_track"] > g.nodes[node]["E_calo"]: - g.nodes[node]["remap_pid"] = 211 - else: - if abs(nd["remap_pid"]) == 11: - g.nodes[node]["remap_pid"] = 22 - else: - g.nodes[node]["remap_pid"] = 130 - g.nodes[node]["charge"] = 0 - - # remove calopart/trackingpart not linked to any elements - # as these are not reconstructable in principle - nodes_to_remove = [] - for node in g.nodes: - if node[0] == "cp": - deg = g.degree[node] - if deg == 0: - nodes_to_remove += [node] - g.remove_nodes_from(nodes_to_remove) - print("unlinked cleanup, met={:.2f}".format(compute_gen_met(g))) + # loop over caloparticles + cps = [(g.nodes[n]["pt"], n) for n in g.nodes if n[0] == "cp"] + for _, cp in cps: - return g + # get all associated elements with type==elem_type that received a contribution from this caloparticle + sucs = [(suc, g.edges[cp, suc]["weight"], g.nodes[suc]["e"]) for suc in g.successors(cp) if g.nodes[suc]["typ"] == elem_type] + sum_sucs_w = sum([s[1] for s in sucs]) + sucs = [s[0] for s in sucs] + if len(sucs) > 1: + lv = vector.obj( + pt=g.nodes[cp]["pt"], + eta=g.nodes[cp]["eta"], + phi=g.nodes[cp]["phi"], + e=g.nodes[cp]["e"], + ) + lv_fracs = [] + for suc in sucs: + frac_e = g.edges[cp, suc]["weight"] / sum_sucs_w + lv_fracs.append(lv * frac_e) + + max_cp = max([n[1] for n in g.nodes if n[0] == "cp"]) + 1 + new_cp_index = max_cp + for lv_frac, suc in zip(lv_fracs, sucs): + g.add_node( + ("cp", new_cp_index), + pt=lv_frac.pt, + eta=lv_frac.eta, + phi=lv_frac.phi, + e=lv_frac.e, + charge=g.nodes[cp]["charge"], + pid=g.nodes[cp]["pid"], + ispu=g.nodes[cp]["ispu"], + ) + g.add_edge(("cp", new_cp_index), suc, weight=g.edges[cp, suc]["weight"]) + new_cp_index += 1 + g.remove_node(cp) + + +def find_representative_elements(g, elem_to_gp, gp_to_elem, elem_type): + unused_elems = [] + elems = [(g.nodes[e]["pt"], e) for e in g.nodes if e[0] == "elem" and g.nodes[e]["typ"] == elem_type] + elems_sorted = sorted(elems, key=lambda x: x[0], reverse=True) + for _, elem in elems_sorted: + gps = list(g.predecessors(elem)) + gps_weight = [(g.edges[(gp, elem)]["weight"], gp) for gp in gps if gp not in gp_to_elem if gp[0] == "cp"] + gps_weight_sorted = sorted(gps_weight, key=lambda x: x[0], reverse=True) + if len(gps_weight_sorted) > 0: + gp = gps_weight_sorted[0][1] + elem_to_gp[elem] = gp + gp_to_elem[gp] = elem + else: + unused_elems.append(elem) -def prepare_normalized_table(g, genparticle_energy_threshold=0.2): - # rg = g.reverse() +def prepare_normalized_table(g, iev): + split_caloparticles(g, 1) + # pickle.dump(g, open("split_g_{}.pkl".format(iev), "wb"), pickle.HIGHEST_PROTOCOL) all_genparticles = [] all_elements = [] @@ -385,33 +222,34 @@ def prepare_normalized_table(g, genparticle_energy_threshold=0.2): all_genparticles = list(set(all_genparticles)) all_elements = sorted(all_elements) - # assign genparticles in reverse energy order uniquely to best element elem_to_gp = {} # map of element -> genparticles - unmatched_gp = [] - for gp in sorted(all_genparticles, key=lambda x: g.nodes[x]["e"], reverse=True): - elems = [e for e in g.successors(gp)] - - # sort elements by energy deposit from genparticle - elems_sorted = sorted( - [(g.edges[gp, e]["weight"], e) for e in elems], - key=lambda x: x[0], - reverse=True, - ) - - chosen_elem = None - for weight, elem in elems_sorted: - if not (elem in elem_to_gp): 
- chosen_elem = elem - elem_to_gp[elem] = [] - break - - if chosen_elem is None: - unmatched_gp += [gp] - else: - elem_to_gp[elem] += [gp] + gp_to_elem = {} # map of genparticle -> element + + # assign genparticles in reverse pt order uniquely to best element + find_representative_elements(g, elem_to_gp, gp_to_elem, 1) + find_representative_elements(g, elem_to_gp, gp_to_elem, 6) + find_representative_elements(g, elem_to_gp, gp_to_elem, 4) + find_representative_elements(g, elem_to_gp, gp_to_elem, 5) + find_representative_elements(g, elem_to_gp, gp_to_elem, 8) + find_representative_elements(g, elem_to_gp, gp_to_elem, 9) + find_representative_elements(g, elem_to_gp, gp_to_elem, 10) + find_representative_elements(g, elem_to_gp, gp_to_elem, 11) + + s1 = set(list(gp_to_elem.keys())) + s2 = set(all_genparticles) + unmatched_gp = list(s2 - s1) + # for gp in unmatched_gp: + # print(gp, g.nodes[gp]["pid"], g.nodes[gp]["pt"]) + + # s1 = set(list(elem_to_gp.keys())) + # s2 = set(all_elements) + # unmatched_elem = list(s2 - s1) + # for elem in unmatched_elem: + # print(elem, g.nodes[elem]["typ"], g.nodes[elem]["pt"]) # assign unmatched genparticles to best element, allowing for overlaps - for gp in sorted(unmatched_gp, key=lambda x: g.nodes[x]["e"], reverse=True): + elem_to_gp = {k: [v] for k, v in elem_to_gp.items()} + for gp in sorted(unmatched_gp, key=lambda x: g.nodes[x]["pt"], reverse=True): elems = [e for e in g.successors(gp)] elems_sorted = sorted( [(g.edges[gp, e]["weight"], e) for e in elems], @@ -421,12 +259,11 @@ def prepare_normalized_table(g, genparticle_energy_threshold=0.2): _, elem = elems_sorted[0] elem_to_gp[elem] += [gp] + # Find primary element for each PFCandidate unmatched_cand = [] elem_to_cand = {} - - # Find primary element for each PFCandidate - for cand in sorted(all_pfcandidates, key=lambda x: g.nodes[x]["e"], reverse=True): - tp = g.nodes[cand]["typ"] + for cand in sorted(all_pfcandidates, key=lambda x: g.nodes[x]["pt"], reverse=True): + tp = g.nodes[cand]["pid"] neighbors = list(g.predecessors(cand)) chosen_elem = None @@ -479,13 +316,11 @@ def prepare_normalized_table(g, genparticle_energy_threshold=0.2): ycand.fill(0.0) for ielem, elem in enumerate(all_elements): - elem_type = g.nodes[elem]["typ"] genparticles = sorted( elem_to_gp.get(elem, []), key=lambda x: g.edges[(x, elem)]["weight"], reverse=True, ) - # genparticles = [gp for gp in genparticles if g.nodes[gp]["e"] > genparticle_energy_threshold] candidate = elem_to_cand.get(elem, None) for j in range(len(elem_branches)): @@ -499,13 +334,12 @@ def prepare_normalized_table(g, genparticle_energy_threshold=0.2): # if several CaloParticles/TrackingParticles are associated to ONLY this element, merge them, as they are not reconstructable separately if len(genparticles) > 0: + pids_e = sorted([(g.nodes[gp]["pid"], g.nodes[gp]["e"]) for gp in genparticles], key=lambda x: x[1], reverse=True) + # get the pid of the highest-energy particle associated with this element + pid = pids_e[0][0] - orig_pid = [(g.nodes[gp]["pid"], g.nodes[gp]["e"]) for gp in genparticles] - orig_pid = sorted(orig_pid, key=lambda x: x[1], reverse=True) - orig_pid = orig_pid[0][0] - - pid = g.nodes[genparticles[0]]["remap_pid"] charge = g.nodes[genparticles[0]]["charge"] + pid = map_pdgid_to_candid(pid, charge) sum_pu = 0.0 sum_tot = 0.0 @@ -519,42 +353,23 @@ def prepare_normalized_table(g, genparticle_energy_threshold=0.2): sum_pu += g.nodes[gp]["ispu"] * g.nodes[gp]["e"] sum_tot += g.nodes[gp]["e"] - # remap PID in case of HCAL cluster to neutral - 
if elem_type == 5 and (pid == 22 or pid == 11): - pid = 130 - - # remap forward region to HFHAD or HFEM - if elem_type in [8, 9]: - if pid == 130: - pid = 1 - elif pid == 22: - pid = 2 - - # Remap HF candidates to neutral hadron or photon in case not matched to HF - if elem_type in [2, 3, 4, 5]: - if pid == 1: - pid = 130 - elif pid == 2: - pid = 22 - gp = { "pt": lv.rho, "eta": lv.eta, "sin_phi": np.sin(lv.phi), "cos_phi": np.cos(lv.phi), "e": lv.t, - "typ": pid, - "orig_pid": orig_pid, + "pid": pid, "px": lv.x, "py": lv.y, "pz": lv.z, "ispu": sum_pu / sum_tot, - "charge": charge if pid in [211, 11, 13] else 0, + "charge": charge, } - # print(" mlpf: type={} E={:.2f} eta={:.2f} phi={:.2f} q={}".format(pid, lv.t, lv.eta, lv.phi, gp["charge"])) for j in range(len(target_branches)): ygen[target_branches[j]][ielem] = gp[target_branches[j]] + px = np.sum(ygen["pt"] * ygen["cos_phi"]) py = np.sum(ygen["pt"] * ygen["sin_phi"]) met = np.sqrt(px**2 + py**2) @@ -639,6 +454,14 @@ def make_graph(ev, iev): caloparticle_ev = ev["caloparticle_ev"][iev] caloparticle_idx_trackingparticle = ev["caloparticle_idx_trackingparticle"][iev] + simcluster_pid = ev["simcluster_pid"][iev] + simcluster_pt = ev["simcluster_pt"][iev] + simcluster_e = ev["simcluster_energy"][iev] + simcluster_eta = ev["simcluster_eta"][iev] + simcluster_phi = ev["simcluster_phi"][iev] + simcluster_idx_trackingparticle = ev["simcluster_idx_trackingparticle"][iev] + simcluster_idx_caloparticle = ev["simcluster_idx_caloparticle"][iev] + pfcandidate_pdgid = ev["pfcandidate_pdgid"][iev] pfcandidate_pt = ev["pfcandidate_pt"][iev] pfcandidate_e = ev["pfcandidate_energy"][iev] @@ -719,7 +542,7 @@ def make_graph(ev, iev): for iobj in range(len(gen_pdgid)): g.add_node( ("gen", iobj), - typ=gen_pdgid[iobj], + pid=abs(gen_pdgid[iobj]), pt=gen_pt[iobj], e=gen_e[iobj], eta=gen_eta[iobj], @@ -735,7 +558,7 @@ def make_graph(ev, iev): for iobj in range(len(trackingparticle_pid)): g.add_node( ("tp", iobj), - pid=trackingparticle_pid[iobj], + pid=abs(trackingparticle_pid[iobj]), charge=trackingparticle_charge[iobj], pt=trackingparticle_pt[iobj], e=trackingparticle_e[iobj], @@ -746,13 +569,9 @@ def make_graph(ev, iev): # CaloParticles for iobj in range(len(caloparticle_pid)): - if abs(caloparticle_pid[iobj]) == 15: - print( - "tau caloparticle pt={}, this will introduce fake MET due to inclusion of neutrino in the caloparticle".format(caloparticle_pt[iobj]) - ) g.add_node( ("cp", iobj), - pid=caloparticle_pid[iobj], + pid=abs(caloparticle_pid[iobj]), charge=caloparticle_charge[iobj], pt=caloparticle_pt[iobj], e=caloparticle_e[iobj], @@ -760,12 +579,32 @@ def make_graph(ev, iev): phi=caloparticle_phi[iobj], ispu=float(caloparticle_ev[iobj] != 0), ) + itp = caloparticle_idx_trackingparticle[iobj] + if itp != -1: + g.add_edge(("cp", iobj), ("tp", itp)) + + # SimClusters + for iobj in range(len(simcluster_pid)): + g.add_node( + ("sc", iobj), + pid=abs(simcluster_pid[iobj]), + pt=simcluster_pt[iobj], + eta=simcluster_eta[iobj], + phi=simcluster_phi[iobj], + e=simcluster_e[iobj], + ) + icp = simcluster_idx_caloparticle[iobj] + g.add_edge(("cp", icp), ("sc", iobj)) + + itp = simcluster_idx_trackingparticle[iobj] + if itp != -1: + g.add_edge(("sc", iobj), ("tp", itp)) # baseline PF for cross-checks for iobj in range(len(pfcandidate_pdgid)): g.add_node( ("pfcand", iobj), - typ=abs(pfcandidate_pdgid[iobj]), + pid=abs(pfcandidate_pdgid[iobj]), pt=pfcandidate_pt[iobj], e=pfcandidate_e[iobj], eta=pfcandidate_eta[iobj], @@ -773,61 +612,126 @@ def make_graph(ev, iev): 
cos_phi=np.cos(pfcandidate_phi[iobj]),
             charge=get_charge(pfcandidate_pdgid[iobj]),
             ispu=0.0,  # for PF candidates, we don't know if it was PU or not
-            orig_pid=0,  # placeholder to match processed gp
         )
 
     trackingparticle_to_element_first = ev["trackingparticle_to_element.first"][iev]
     trackingparticle_to_element_second = ev["trackingparticle_to_element.second"][iev]
     trackingparticle_to_element_cmp = ev["trackingparticle_to_element_cmp"][iev]
-    # for trackingparticles associated to elements, set a very high edge weight
-    for tp, elem, c in zip(
+    for iobj, elem, c in zip(
         trackingparticle_to_element_first,
         trackingparticle_to_element_second,
         trackingparticle_to_element_cmp,
     ):
-        # ignore BREM, because the TrackingParticle is already linked to GSF
-        if g.nodes[("elem", elem)]["typ"] in [7]:
+        if g.nodes[("elem", elem)]["typ"] in [2, 3, 7]:
             continue
-        g.add_edge(("tp", tp), ("elem", elem), weight=c)
+        if ("tp", iobj) in g.nodes and ("elem", elem) in g.nodes:
+            # print(("tp", iobj), ("elem", elem), c)
+            g.add_edge(("tp", iobj), ("elem", elem), weight=c * g.nodes[("elem", elem)]["e"])
 
     caloparticle_to_element_first = ev["caloparticle_to_element.first"][iev]
     caloparticle_to_element_second = ev["caloparticle_to_element.second"][iev]
     caloparticle_to_element_cmp = ev["caloparticle_to_element_cmp"][iev]
-    for sc, elem, c in zip(
+    for iobj, elem, c in zip(
         caloparticle_to_element_first,
         caloparticle_to_element_second,
         caloparticle_to_element_cmp,
     ):
-        if not (g.nodes[("elem", elem)]["typ"] in [7]):
-            g.add_edge(("cp", sc), ("elem", elem), weight=c)
+        if g.nodes[("elem", elem)]["typ"] in [2, 3, 7]:
+            continue
+        if ("cp", iobj) in g.nodes and ("elem", elem) in g.nodes:
+            # print(("cp", iobj), ("elem", elem), c)
+            g.add_edge(("cp", iobj), ("elem", elem), weight=c)
+
+    simcluster_to_element_first = ev["simcluster_to_element.first"][iev]
+    simcluster_to_element_second = ev["simcluster_to_element.second"][iev]
+    simcluster_to_element_cmp = ev["simcluster_to_element_cmp"][iev]
+    for iobj, elem, c in zip(
+        simcluster_to_element_first,
+        simcluster_to_element_second,
+        simcluster_to_element_cmp,
+    ):
+        if not (g.nodes[("elem", elem)]["typ"] in [2, 3, 7]):
+            # print(("sc", iobj), ("elem", elem), c)
+            g.add_edge(("sc", iobj), ("elem", elem), weight=c)
 
     print("make_graph init, met={:.2f}".format(compute_gen_met(g)))
 
-    # merge caloparticles and trackingparticles that refer to the same particle
+    # reattach the element links of each trackingparticle to its parents (caloparticle or simcluster), then drop the trackingparticle
+    tps = [n for n in g.nodes if n[0] == "tp"]
+    for tp in tps:
+        preds = g.predecessors(tp)
+        sucs = g.successors(tp)
+        for pred in preds:
+            for suc in sucs:
+                if (pred, suc) not in g.edges:
+                    g.add_edge(pred, suc, weight=g.edges[(tp, suc)]["weight"])
+    g.remove_nodes_from(tps)
+
+    # pickle.dump(g, open("init_g_{}.pkl".format(iev), "wb"), pickle.HIGHEST_PROTOCOL)
+
+    # add any remaining links between SimClusters and unmatched Elements using delta-R proximity
+    elems = [n for n in g.nodes if n[0] == "elem"]
+    scs = [node for node in g.nodes if node[0] == "sc"]
+    sc_coords = np.array([[g.nodes[n]["eta"] for n in scs], [g.nodes[n]["phi"] for n in scs]])
+    tree = KDTree(sc_coords.T, leaf_size=32)
+    for elem in elems:
+        if len(list(g.predecessors(elem))) == 0 and g.nodes[elem]["pt"] > 1:
+            eta = g.nodes[elem]["eta"]
+            phi = g.nodes[elem]["phi"]
+            nearby_scs = tree.query_radius([[eta, phi]], 0.05)[0]
+            for isc in nearby_scs:
+                if scs[isc] in g.nodes:
+                    if (scs[isc], elem) not in g.edges:
+                        g.add_edge(scs[isc], elem, weight=g.nodes[elem]["e"])
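+
+    # note: the KDTree query above uses a plain Euclidean metric on (eta, phi),
+    # so phi is not wrapped at +-pi and matches across the phi boundary can be
+    # missed; the recovered edge is assigned the full element energy as its
+    # weight, since no measured contribution is available for these links.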
+
+    # reattach the element links of each simcluster to its parent caloparticle, then drop the simcluster
+    scs = [n for n in g.nodes if n[0] == "sc"]
+    for sc in scs:
+        preds = g.predecessors(sc)
+        sucs = g.successors(sc)
+        for pred in preds:
+            for suc in sucs:
+                if (pred, suc) not in g.edges:
+                    g.add_edge(pred, suc, weight=g.edges[(sc, suc)]["weight"])
+
+    g.remove_nodes_from(scs)
+    print("make_graph duplicates removed, met={:.2f}".format(compute_gen_met(g)))
+
+    elems = [n for n in g.nodes if n[0] == "elem"]
     nodes_to_remove = []
-    for idx_cp, idx_tp in enumerate(caloparticle_idx_trackingparticle):
-        if idx_tp != -1:
-
-            # add all the edges from the trackingparticle to the caloparticle
-            for elem in g.neighbors(("tp", idx_tp)):
-                g.add_edge(
-                    ("cp", idx_cp),
-                    elem,
-                    weight=g.edges[("tp", idx_tp), elem]["weight"],
-                )
-            # remove the trackingparticle, keep the caloparticle
-            nodes_to_remove += [("tp", idx_tp)]
+    for elem in elems:
+        if g.nodes[elem]["typ"] in [2, 3, 7]:
+            nodes_to_remove.append(elem)
     g.remove_nodes_from(nodes_to_remove)
-    print("make_graph duplicates removed, met={:.2f}".format(compute_gen_met(g)))
 
     # merge_closeby_particles(g)
-    # print("cleanup done, met={:.2f}".format(compute_gen_met(g)))
+    print("cleanup done, met={:.2f}".format(compute_gen_met(g)))
 
     element_to_candidate_first = ev["element_to_candidate.first"][iev]
     element_to_candidate_second = ev["element_to_candidate.second"][iev]
     for elem, pfcand in zip(element_to_candidate_first, element_to_candidate_second):
-        g.add_edge(("elem", elem), ("pfcand", pfcand), weight=1.0)
+        if ("elem", elem) in g.nodes:
+            g.add_edge(("elem", elem), ("pfcand", pfcand), weight=1.0)
 
+    # pickle.dump(g, open("cleanup_g_{}.pkl".format(iev), "wb"), pickle.HIGHEST_PROTOCOL)
+    # print_gen(g)
+
+    return g
+
+
+def cleanup_graph(g):
+    all_removed_edges = []
+    elems = [n for n in g.nodes if n[0] == "elem"]
+    for elem in elems:
+        edges_to_remove = []
+        for pred in g.predecessors(elem):
+            edge = (pred, elem)
+            if g.edges[edge]["weight"] / g.nodes[elem]["e"] < 0.1:
+                edges_to_remove.append(edge)
+        all_removed_edges += edges_to_remove
+    # print("removed edges:", all_removed_edges)
+    # for edge in all_removed_edges:
+    #     print(g.nodes[edge[0]]["e"], g.nodes[edge[1]]["e"], g.edges[edge]["weight"])
+    g.remove_edges_from(all_removed_edges)
 
     return g
@@ -847,10 +751,10 @@ def process(args):
     for iev in tqdm.tqdm(events_to_process):
         print("processing iev={}, genmet_cmssw={:.2f}".format(iev, ev["genmet_pt"][iev][0]))
         g = make_graph(ev, iev)
-        g = cleanup_graph(g)
+        # g = cleanup_graph(g)
 
         # associate target particles to input elements
-        Xelem, ycand, ygen = prepare_normalized_table(g)
+        Xelem, ycand, ygen = prepare_normalized_table(g, iev)
         data = {}
 
         # produce a list of stable pythia particles for downstream validation
@@ -860,7 +764,7 @@
             for n in g.nodes
             if n[0] == "gen" and ((g.nodes[n]["status"] == 1) or ((g.nodes[n]["status"] == 2) and g.nodes[n]["num_daughters"] == 0))
         ]
-        feats = ["typ", "pt", "eta", "phi", "e"]
+        feats = ["pid", "pt", "eta", "phi", "e"]
         arr_ptcls_pythia = np.array([[g.nodes[n][f] for f in feats] for n in ptcls_pythia])
 
         # produce pythia-level genjets and genmet
@@ -885,10 +789,6 @@
             "genmet": genmet,
         }
 
-        # print("trk", ygen[Xelem["typ"] == 1]["typ"])
-        # print("ecal", ygen[Xelem["typ"] == 4]["typ"])
-        # print("hcal", ygen[Xelem["typ"] == 4]["typ"])
-
         if args.save_full_graph:
             data["full_graph"] = g
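Note on postprocessing2.py above (illustrative sketch, not part of the patch): split_caloparticles distributes one CaloParticle four-vector over its matched elements in proportion to the association weights, so the fragments sum back to the original four-momentum. A minimal standalone version of that idea, assuming the scikit-hep vector package already used in this file (the weights below are hypothetical):

    import vector

    # hypothetical per-element association weights for one CaloParticle
    weights = [6.0, 3.0, 1.0]
    lv = vector.obj(pt=10.0, eta=1.2, phi=0.5, e=25.0)

    # scale the four-vector by each weight fraction, as split_caloparticles does
    pieces = [lv * (w / sum(weights)) for w in weights]

    # the pieces sum back to the original four-momentum (up to float precision)
    total = pieces[0] + pieces[1] + pieces[2]
    assert abs(total.pt - lv.pt) < 1e-6 and abs(total.e - lv.e) < 1e-6

diff --git a/mlpf/data_cms/postprocessing_jobs.py b/mlpf/data_cms/postprocessing_jobs.py
index 76f70d313..685470a3d 100644
--- a/mlpf/data_cms/postprocessing_jobs.py
+++ b/mlpf/data_cms/postprocessing_jobs.py
@@ -8,7 +8,7 @@ 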
def chunks(lst, n): yield lst[i : i + n] -def write_script(postprocessing_cmd, infiles, outfiles): +def write_script(infiles, outfiles): s = [] s += ["#!/bin/bash"] s += ["#SBATCH --partition short"] @@ -23,7 +23,7 @@ def write_script(postprocessing_cmd, infiles, outfiles): outf_no_bzip = outf.replace(".pkl.bz2", ".pkl") s += [f"if [ ! -f {outf} ]; then"] s += [ - " singularity exec -B /local /home/software/singularity/pytorch.simg:2024-06-26" + " singularity exec -B /local /home/software/singularity/pytorch.simg:2024-08-18" + f" python3 mlpf/data_cms/postprocessing2.py --input {inf} --outpath {outpath}" ] s += [f" bzip2 -z {outf_no_bzip}"] @@ -33,24 +33,14 @@ def write_script(postprocessing_cmd, infiles, outfiles): samples = [ - "/local/joosep/mlpf/cms/v3_3/nopu/SingleProtonMinusFlatPt0p7To1000_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/SingleMuFlatPt1To1000_pythia8_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/TTbar_14TeV_TuneCUETP8M1_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/SingleK0FlatPt1To1000_pythia8_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/SinglePi0Pt1To1000_pythia8_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/SingleGammaFlatPt1To1000_pythia8_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/SinglePiMinusFlatPt0p7To1000_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/SingleNeutronFlatPt0p7To1000_cfi", - "/local/joosep/mlpf/cms/v3_3/nopu/SingleElectronFlatPt1To1000_pythia8_cfi", - "/local/joosep/mlpf/cms/v3_3/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi", - "/local/joosep/mlpf/cms/v3_3/pu55to75/QCDForPF_14TeV_TuneCUETP8M1_cfi", + "/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi", ] ichunk = 1 for sample in samples: - infiles = list(glob.glob(f"{sample}/root/*.root")) + infiles = list(glob.glob(f"{sample}/root/pfntuple*.root")) for infiles_chunk in chunks(infiles, 10): - outfiles_chunk = [inf.replace(".root", ".pkl.bz2").replace("/root/", "/raw_orig/") for inf in infiles_chunk] + outfiles_chunk = [inf.replace(".root", ".pkl.bz2").replace("/root/", "/raw2/") for inf in infiles_chunk] os.makedirs(os.path.dirname(outfiles_chunk[0]), exist_ok=True) scr = write_script(infiles_chunk, outfiles_chunk) ofname = f"jobscripts/postproc_{ichunk}.sh" diff --git a/mlpf/data_cms/prepare_args.py b/mlpf/data_cms/prepare_args.py index 0b1d7a557..a07892e56 100644 --- a/mlpf/data_cms/prepare_args.py +++ b/mlpf/data_cms/prepare_args.py @@ -3,30 +3,30 @@ import os -outdir = "/local/joosep/mlpf/cms/20240702_cptruthdef" +outdir = "/local/joosep/mlpf/cms/20240823_simcluster" samples = [ - ("TTbar_14TeV_TuneCUETP8M1_cfi", 100000, 120010, "genjob_pu55to75.sh", outdir + "/pu55to75"), +# ("TTbar_14TeV_TuneCUETP8M1_cfi", 105000, 110010, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("ZTT_All_hadronic_14TeV_TuneCUETP8M1_cfi", 200000, 220010, "genjob_pu55to75.sh", outdir + "/pu55to75"), - ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 300000, 320010, "genjob_pu55to75.sh", outdir + "/pu55to75"), + ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 300000, 305000, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("SMS-T1tttt_mGl-1500_mLSP-100_TuneCP5_14TeV_pythia8_cfi", 500000, 520010, "genjob_pu55to75.sh", outdir + "/pu55to75"), # ("ZpTT_1500_14TeV_TuneCP5_cfi", 600000, 620010, "genjob_pu55to75.sh", outdir + "/pu55to75"), - ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 720010, "genjob_pu55to75.sh", outdir + "/pu55to75"), - - ("TTbar_14TeV_TuneCUETP8M1_cfi", 700000, 720010, "genjob_nopu.sh", outdir + "/nopu"), - ("MultiParticlePFGun50_cfi", 800000, 820000, "genjob_nopu.sh", outdir + "/nopu"), - ("VBF_TuneCP5_14TeV_pythia8_cfi", 900000, 920010, 
"genjob_nopu.sh", outdir + "/nopu"), - ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 1000000,1020010, "genjob_nopu.sh", outdir + "/nopu"), - - ("SingleElectronFlatPt1To1000_pythia8_cfi", 900000, 901000, "genjob_nopu.sh", outdir + "/nopu"), -# ("SingleGammaFlatPt1To1000_pythia8_cfi", 1000000,1000100, "genjob_nopu.sh", outdir + "/nopu"), -# ("SingleMuFlatPt1To1000_pythia8_cfi", 1100000,1100100, "genjob_nopu.sh", outdir + "/nopu"), -# ("SingleNeutronFlatPt0p7To1000_cfi", 1200000,1200100, "genjob_nopu.sh", outdir + "/nopu"), -# ("SinglePi0Pt1To1000_pythia8_cfi", 1300000,1300100, "genjob_nopu.sh", outdir + "/nopu"), -# ("SinglePiMinusFlatPt0p7To1000_cfi", 1400000,1400100, "genjob_nopu.sh", outdir + "/nopu"), -# ("SingleProtonMinusFlatPt0p7To1000_cfi", 1500000,1500100, "genjob_nopu.sh", outdir + "/nopu"), -# ("SingleTauFlatPt1To1000_cfi", 1600000,1610000, "genjob_nopu.sh", outdir + "/nopu"), -# ("SingleK0FlatPt1To1000_pythia8_cfi", 1700000,1700100, "genjob_nopu.sh", outdir + "/nopu"), +# ("VBF_TuneCP5_14TeV_pythia8_cfi", 700000, 720010, "genjob_pu55to75.sh", outdir + "/pu55to75"), + +# ("TTbar_14TeV_TuneCUETP8M1_cfi", 702000, 705000, "genjob_nopu.sh", outdir + "/nopu"), +# ("MultiParticlePFGun50_cfi", 800000, 820000, "genjob_nopu.sh", outdir + "/nopu"), +# ("VBF_TuneCP5_14TeV_pythia8_cfi", 900000, 920010, "genjob_nopu.sh", outdir + "/nopu"), +# ("QCDForPF_14TeV_TuneCUETP8M1_cfi", 1000000,1020010, "genjob_nopu.sh", outdir + "/nopu"), + +# ("SingleElectronFlatPt1To1000_pythia8_cfi", 900000, 900010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SingleGammaFlatPt1To1000_pythia8_cfi", 1000000,1000010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SingleMuFlatPt1To1000_pythia8_cfi", 1100000,1100010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SingleNeutronFlatPt0p7To1000_cfi", 1200000,1200010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SinglePi0Pt1To1000_pythia8_cfi", 1300000,1300010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SinglePiMinusFlatPt0p7To1000_cfi", 1400000,1400010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SingleProtonMinusFlatPt0p7To1000_cfi", 1500000,1500010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SingleTauFlatPt1To1000_cfi", 1600000,1600010, "genjob_nopu.sh", outdir + "/nopu"), +# ("SingleK0FlatPt1To1000_pythia8_cfi", 1700000,1700010, "genjob_nopu.sh", outdir + "/nopu"), ] if __name__ == "__main__": @@ -36,6 +36,6 @@ os.makedirs(this_outdir + "/" + samp + "/root", exist_ok=True) for seed in range(seed0, seed1): - p = this_outdir + "/" + samp + "/raw/pfntuple_{}.pkl.bz2".format(seed) + p = this_outdir + "/" + samp + "/raw2/pfntuple_{}.pkl.bz2".format(seed) if not os.path.isfile(p): print(f"sbatch --mem-per-cpu 8G --partition main --time 20:00:00 --cpus-per-task 1 scripts/tallinn/cmssw-el8.sh mlpf/data_cms/{script} {samp} {seed}") diff --git a/mlpf/heptfds/cms_pf/cms_utils.py b/mlpf/heptfds/cms_pf/cms_utils.py index db0f8fe67..4f1b8cce9 100644 --- a/mlpf/heptfds/cms_pf/cms_utils.py +++ b/mlpf/heptfds/cms_pf/cms_utils.py @@ -144,8 +144,8 @@ def prepare_data_cms(fn): Xelem["sin_phi"] = np.sin(Xelem["phi"]) Xelem["cos_phi"] = np.cos(Xelem["phi"]) Xelem["typ_idx"] = np.array([ELEM_LABELS_CMS.index(int(i)) for i in Xelem["typ"]], dtype=np.float32) - ygen["typ_idx"] = np.array([CLASS_LABELS_CMS.index(abs(int(i))) for i in ygen["typ"]], dtype=np.float32) - ycand["typ_idx"] = np.array([CLASS_LABELS_CMS.index(abs(int(i))) for i in ycand["typ"]], dtype=np.float32) + ygen["typ_idx"] = np.array([CLASS_LABELS_CMS.index(abs(int(i))) for i in ygen["pid"]], dtype=np.float32) + ycand["typ_idx"] = 
np.array([CLASS_LABELS_CMS.index(abs(int(i))) for i in ycand["pid"]], dtype=np.float32)
 
     Xelem_flat = ak.to_numpy(
         np.stack(
diff --git a/mlpf/heptfds/cms_pf/ttbar.py b/mlpf/heptfds/cms_pf/ttbar.py
index 2eec180d7..077ed97fd 100644
--- a/mlpf/heptfds/cms_pf/ttbar.py
+++ b/mlpf/heptfds/cms_pf/ttbar.py
@@ -21,7 +21,7 @@
 class CmsPfTtbar(tfds.core.GeneratorBasedBuilder):
     """DatasetBuilder for cms_pf dataset."""
 
-    VERSION = tfds.core.Version("2.1.0")
+    VERSION = tfds.core.Version("2.2.0")
     RELEASE_NOTES = {
         "1.0.0": "Initial release.",
         "1.1.0": "Add muon type, fix electron GSF association",
@@ -37,6 +37,7 @@ class CmsPfTtbar(tfds.core.GeneratorBasedBuilder):
         "1.8.0": "Add ispu, genjets, genmet; disable genjet_idx; improved merging",
         "2.0.0": "New truth def based primarily on CaloParticles",
         "2.1.0": "Additional stats",
+        "2.2.0": "Split CaloParticles along tracks",
     }
     MANUAL_DOWNLOAD_INSTRUCTIONS = """
     rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_ttbar ~/tensorflow_datasets/
diff --git a/mlpf/heptfds/cms_pf/ttbar_nopu.py b/mlpf/heptfds/cms_pf/ttbar_nopu.py
index 0879ebb7f..26158b459 100644
--- a/mlpf/heptfds/cms_pf/ttbar_nopu.py
+++ b/mlpf/heptfds/cms_pf/ttbar_nopu.py
@@ -21,11 +21,12 @@
 class CmsPfTtbarNopu(tfds.core.GeneratorBasedBuilder):
     """DatasetBuilder for cms_pf_ttbar_nopu dataset."""
 
-    VERSION = tfds.core.Version("2.0.0")
+    VERSION = tfds.core.Version("2.2.0")
     RELEASE_NOTES = {
         "1.7.1": "First version",
         "1.8.0": "Add ispu, genjets, genmet; disable genjet_idx; improved merging",
         "2.0.0": "New truth def based primarily on CaloParticles",
+        "2.2.0": "Split CaloParticles along tracks",
     }
     MANUAL_DOWNLOAD_INSTRUCTIONS = """
     rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/tensorflow_datasets/cms/cms_pf_ttbar_nopu ~/tensorflow_datasets/
diff --git a/mlpf/pyg/PFDataset.py b/mlpf/pyg/PFDataset.py
index 7ca195e43..22bcec6a4 100644
--- a/mlpf/pyg/PFDataset.py
+++ b/mlpf/pyg/PFDataset.py
@@ -35,12 +35,21 @@ def __getitem__(self, item):
             ret["ygen"] = ret["ygen"][sortidx]
 
         if self.ds.dataset_info.name.startswith("cms_"):
+            # track, target label neutral hadron -> reconstruct as charged hadron
             ret["ygen"][:, 0][(ret["X"][:, 0] == 1) & (ret["ygen"][:, 0] == 2)] = 1
+
+            # track, target label photon -> reconstruct as charged hadron
             ret["ygen"][:, 0][(ret["X"][:, 0] == 1) & (ret["ygen"][:, 0] == 5)] = 1
 
+            # ECAL cluster, target label charged hadron -> reconstruct as photon
             ret["ygen"][:, 0][(ret["X"][:, 0] == 4) & (ret["ygen"][:, 0] == 1)] = 5
+
+            # HCAL cluster, target label charged hadron -> reconstruct as neutral hadron
             ret["ygen"][:, 0][(ret["X"][:, 0] == 5) & (ret["ygen"][:, 0] == 1)] = 2
+
+            # electron target label: as photon on an ECAL cluster (rule disabled below); as neutral hadron on an HCAL cluster
             # ret["ygen"][:, 0][(ret["X"][:, 0]==4) & (ret["ygen"][:, 0] == 6)] = 5
+
             ret["ygen"][:, 0][(ret["X"][:, 0] == 5) & (ret["ygen"][:, 0] == 6)] = 2
             ret["ygen"][:, 0][(ret["X"][:, 0] == 4) & (ret["ygen"][:, 0] == 7)] = 5
             ret["ygen"][:, 0][(ret["X"][:, 0] == 5) & (ret["ygen"][:, 0] == 7)] = 2
diff --git a/mlpf/pyg/training.py b/mlpf/pyg/training.py
index abcb9c5a2..0100a9a72 100644
--- a/mlpf/pyg/training.py
+++ b/mlpf/pyg/training.py
@@ -78,6 +78,7 @@ def mlpf_loss(y, ypred, batch):
     loss = {}
     loss_obj_id = FocalLoss(gamma=2.0, reduction="none")
 
+    msk_pred_particle = torch.unsqueeze((ypred["cls_id"] != 0).to(dtype=torch.float32), axis=-1)
     msk_true_particle = torch.unsqueeze((y["cls_id"] != 0).to(dtype=torch.float32), axis=-1)
     nelem = torch.sum(batch.mask)
     npart = torch.sum(y["cls_id"] != 0)
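Note on the next hunk (illustrative sketch, not part of the patch): the inputs to sliced_wasserstein_loss are standardized by the per-feature std of the true set before the distance is computed. Assuming the repo's sliced_wasserstein_loss follows the usual construction, a generic version for two same-size point clouds looks like:

    import torch

    def sliced_wasserstein(x, y, n_proj=128):
        # x, y: [n_points, n_features] point clouds with equal n_points
        theta = torch.randn(x.shape[-1], n_proj)
        theta = theta / theta.norm(dim=0, keepdim=True)  # random unit directions
        # project onto each direction and compare the sorted 1D distributions
        xp, _ = torch.sort(x @ theta, dim=0)
        yp, _ = torch.sort(y @ theta, dim=0)
        return ((xp - yp) ** 2).mean()

@@ -126,13 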
+127,23 @@ def mlpf_loss(y, ypred, batch): pred_met = torch.sum(px, axis=-2) ** 2 + torch.sum(py, axis=-2) ** 2 loss["MET"] = torch.nn.functional.huber_loss(pred_met.squeeze(dim=-1), batch.genmet).mean() - loss["Sliced_Wasserstein_Loss"] = sliced_wasserstein_loss(ypred["momentum"].detach(), y["momentum"]).mean() - loss["Total"] = loss["Classification_binary"] + loss["Classification"] + loss["Regression"] + was_input_pred = torch.concat([torch.softmax(ypred["cls_binary"].transpose(1, 2), axis=-1), ypred["momentum"]], axis=-1) * batch.mask.unsqueeze( + axis=-1 + ) + was_input_true = torch.concat([torch.nn.functional.one_hot((y["cls_id"] != 0).to(torch.long)), y["momentum"]], axis=-1) * batch.mask.unsqueeze( + axis=-1 + ) + + std = was_input_true[batch.mask].std(axis=0) + loss["Sliced_Wasserstein_Loss"] = sliced_wasserstein_loss(was_input_pred / std, was_input_true / std).mean() + + loss["Total"] = loss["Classification_binary"] + loss["Classification"] + loss["Regression"] # + 0.01 * loss["Sliced_Wasserstein_Loss"] loss["Classification_binary"] = loss["Classification_binary"].detach() loss["Classification"] = loss["Classification"].detach() loss["Regression"] = loss["Regression"].detach() + loss["Sliced_Wasserstein_Loss"] = loss["Sliced_Wasserstein_Loss"].detach() return loss diff --git a/notebooks/cms/cms-dataset.ipynb b/notebooks/cms/cms-dataset.ipynb index 36706672f..62d985a2f 100644 --- a/notebooks/cms/cms-dataset.ipynb +++ b/notebooks/cms/cms-dataset.ipynb @@ -12,6 +12,7 @@ "import awkward\n", "import numpy as np\n", "import fastjet\n", + "import tqdm\n", "\n", "import matplotlib.pyplot as plt" ] @@ -44,7 +45,8 @@ "metadata": {}, "outputs": [], "source": [ - "builder = tfds.builder(\"cms_pf_ttbar\", data_dir=\"/scratch/persistent/joosep/tensorflow_datasets/\")\n", + "ds_string = \"cms_pf_ttbar:2.2.0\"\n", + "builder = tfds.builder(ds_string, data_dir=\"/scratch/persistent/joosep/tensorflow_datasets/\")\n", "ds_train = builder.as_data_source(split=\"train\")" ] }, @@ -56,32 +58,43 @@ "outputs": [], "source": [ "all_genjets = []\n", - "all_genparticles = []\n", + "all_genparticles_awk = []\n", + "all_Xs = []\n", "\n", "#loop over some events in the dataset\n", - "for iev in range(100):\n", + "for iev in tqdm.tqdm(list(range(5000))):\n", " el = ds_train[iev]\n", - " print(len(el[\"X\"]), el.keys())\n", + " # print(len(el[\"X\"]), el.keys())\n", " \n", " genjets = vector.awk(awkward.zip({\"pt\": el[\"genjets\"][:, 0], \"eta\": el[\"genjets\"][:, 1], \"phi\": el[\"genjets\"][:, 2], \"e\": el[\"genjets\"][:, 3]}))\n", - " mask_genparticles = el[\"ygen\"][:, 0]!=0\n", - " genparticles = el[\"ygen\"][mask_genparticles]\n", + " genparticles = el[\"ygen\"]\n", " \n", " gp_phi = np.arctan2(genparticles[:, 4], genparticles[:, 5]) #sphi,cphi -> phi\n", " genparticles_p4 = vector.awk(awkward.zip({\"pt\": genparticles[:, 2], \"eta\": genparticles[:, 3], \"phi\": gp_phi, \"e\": genparticles[:, 6]}))\n", " gp_ispu = genparticles[:, 7]\n", " gp_pid = np.array(CLASS_LABELS_CMS)[genparticles[:, 0].astype(np.int64)]\n", - " genparticles = awkward.Record({\n", + " genparticles_awk = awkward.Array({\n", " \"pid\": gp_pid,\n", " \"p4\": genparticles_p4,\n", " \"ispu\": genparticles[:, 7],\n", " })\n", "\n", + " all_Xs.append(el[\"X\"])\n", " all_genjets.append(genjets)\n", - " all_genparticles.append(genparticles)\n", - "\n", - "all_genjets = awkward.from_iter(all_genjets)\n", - "all_genparticles = awkward.from_iter(all_genparticles)" + " all_genparticles_awk.append(genparticles_awk)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "815302bd-3e2b-45e9-8263-de09265db7bb", + "metadata": {}, + "outputs": [], + "source": [ + "all_Xs = awkward.unflatten(awkward.from_numpy(np.concatenate(all_Xs, axis=0)), counts=[len(x) for x in all_Xs])\n", + "all_genjets = awkward.unflatten(awkward.concatenate(all_genjets), counts=[len(x) for x in all_genjets])\n", + "all_genparticles_awk = awkward.unflatten(awkward.concatenate(all_genparticles_awk), counts=[len(x) for x in all_genparticles_awk])\n", + "all_genparticles_no0 = all_genparticles_awk[all_genparticles_awk[\"pid\"]!=0]" ] }, { @@ -94,10 +107,10 @@ "p4 = vector.awk(\n", " awkward.zip(\n", " {\n", - " \"pt\": all_genparticles.p4.rho,\n", - " \"eta\": all_genparticles.p4.eta,\n", - " \"phi\": all_genparticles.p4.phi,\n", - " \"e\": all_genparticles.p4.t,\n", + " \"pt\": all_genparticles_no0.p4.rho,\n", + " \"eta\": all_genparticles_no0.p4.eta,\n", + " \"phi\": all_genparticles_no0.p4.phi,\n", + " \"e\": all_genparticles_no0.p4.t,\n", " }\n", " )\n", ")" @@ -112,10 +125,10 @@ "source": [ "jetdef = fastjet.JetDefinition(fastjet.antikt_algorithm, 0.4)\n", "cluster = fastjet.ClusterSequence(p4.to_xyzt(), jetdef)\n", - "jets = cluster.inclusive_jets(min_pt=10)\n", + "jets = cluster.inclusive_jets(min_pt=5)\n", "\n", - "cluster = fastjet.ClusterSequence(p4.to_xyzt()[all_genparticles.ispu==0], jetdef)\n", - "jets_nopu = cluster.inclusive_jets(min_pt=10)" + "cluster = fastjet.ClusterSequence(p4.to_xyzt()[all_genparticles_no0.ispu==0], jetdef)\n", + "jets_nopu = cluster.inclusive_jets(min_pt=5)" ] }, { @@ -125,12 +138,86 @@ "metadata": {}, "outputs": [], "source": [ - "b = np.linspace(10,100,100)\n", + "b = np.logspace(0,3,100)\n", "plt.hist(awkward.flatten(all_genjets.rho), bins=b, histtype=\"step\", label=\"genjets\");\n", "plt.hist(awkward.flatten(jets.pt), bins=b, histtype=\"step\", label=\"all gp jets\");\n", "plt.hist(awkward.flatten(jets_nopu.pt), bins=b, histtype=\"step\", label=\"ispu=0 gp jets\");\n", - "plt.legend()" + "plt.legend()\n", + "plt.yscale(\"log\")\n", + "plt.xscale(\"log\")\n", + "plt.xlabel(\"jet pt\")\n", + "plt.ylabel(\"number of jets\")\n", + "plt.title(ds_string)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd105f3e-61af-46f1-bb84-da196f57a391", + "metadata": {}, + "outputs": [], + "source": [ + "gen_pid = awkward.flatten(all_genparticles_awk[all_Xs[:, :, 0]==1][\"pid\"])\n", + "gen_pt = awkward.flatten(all_genparticles_awk[all_Xs[:, :, 0]==1][\"p4\"].rho)\n", + "track_pt = awkward.flatten(all_Xs[all_Xs[:, :, 0]==1][:, :, 1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ae5d52d-f1c3-4caa-816a-a38453812998", + "metadata": {}, + "outputs": [], + "source": [ + "b = np.logspace(-1,2,100)\n", + "\n", + "plt.figure(figsize=(6,6))\n", + "plt.hist2d(\n", + " awkward.to_numpy(track_pt[gen_pid!=0]),\n", + " awkward.to_numpy(gen_pt[gen_pid!=0]),\n", + " bins=b\n", + ")\n", + "plt.plot([0.1, 100], [0.1, 100], color=\"black\", ls=\"--\", lw=1.0)\n", + "plt.xscale(\"log\")\n", + "plt.yscale(\"log\")\n", + "plt.xlabel(\"track pt\")\n", + "plt.ylabel(\"gen pt\")\n", + "plt.title(ds_string)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c7d5bf0-a752-43f5-b849-b979c0118388", + "metadata": {}, + "outputs": [], + "source": [ + "bins = np.logspace(-1, 3, 20)\n", + "fracs_gen = []\n", + "\n", + "for ibin in range(len(bins)-1):\n", + " b0 = bins[ibin]\n", + " b1 = bins[ibin+1]\n", + " msk = (track_pt >= b0) & (track_pt < b1)\n", + " frac_gen = 
np.sum(gen_pid[msk]!=0) / np.sum(msk)\n", + " fracs_gen.append(frac_gen)\n", + "\n", + "plt.figure()\n", + "plt.plot(bins[:-1], fracs_gen, marker=\"o\")\n", + "plt.xscale(\"log\")\n", + "plt.xlabel(\"track pT\")\n", + "plt.ylabel(\"fraction of tracks matched to gen\")\n", + "plt.ylim(0,1)\n", + "plt.title(ds_string)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "baa4bb0c-35bf-4232-8b93-15b80ba5f20b", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -149,7 +236,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/notebooks/cms/cms-simvalidation.ipynb b/notebooks/cms/cms-simvalidation.ipynb index 7485c2851..39cb600d6 100644 --- a/notebooks/cms/cms-simvalidation.ipynb +++ b/notebooks/cms/cms-simvalidation.ipynb @@ -30,6 +30,8 @@ "import bz2\n", "import os\n", "import tqdm\n", + "import fastjet\n", + "import vector\n", "\n", "mplhep.style.use(\"CMS\")" ] @@ -62,6 +64,9 @@ "outputs": [], "source": [ "import sys\n", + "sys.path += [\"../../mlpf/\"]\n", + "\n", + "import jet_utils\n", "\n", "sys.path += [\"../../mlpf/plotting/\"]\n", "\n", @@ -71,16 +76,6 @@ "from plot_utils import pid_to_text" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "ce1e996f", - "metadata": {}, - "outputs": [], - "source": [ - "!ls -lrt /local/joosep/mlpf/cms/20240702_cptruthdef/nopu/TTbar_14TeV_TuneCUETP8M1_cfi/raw" - ] - }, { "cell_type": "code", "execution_count": null, @@ -90,7 +85,7 @@ "source": [ "sample = \"TTbar_14TeV_TuneCUETP8M1_cfi\"\n", "\n", - "maxfiles = 50\n", + "maxfiles = 100\n", "if sample.startswith(\"Single\"):\n", " maxfiles = 50\n", "\n", @@ -99,6 +94,16 @@ " os.makedirs(plot_outpath)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "020e1e9d-058e-473b-933d-2c8c965b04ff", + "metadata": {}, + "outputs": [], + "source": [ + "list(glob.glob(\"/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/{}/raw2/*.pkl.bz2\".format(sample)))[:maxfiles][0]" + ] + }, { "cell_type": "code", "execution_count": null, @@ -109,7 +114,7 @@ "pickle_data = sum(\n", " [\n", " pickle.load(bz2.BZ2File(f, \"r\"))\n", - " for f in tqdm.tqdm(list(glob.glob(\"/local/joosep/mlpf/cms/20240702_cptruthdef/nopu/{}/*/*.pkl.bz2\".format(sample)))[:maxfiles])\n", + " for f in tqdm.tqdm(list(glob.glob(\"/local/joosep/mlpf/cms/20240823_simcluster/pu55to75/{}/raw2/*.pkl.bz2\".format(sample)))[:maxfiles])\n", " ],\n", " [],\n", ")\n", @@ -130,7 +135,7 @@ "source": [ "arrs_awk = {}\n", "arrs_flat = {}\n", - "for coll in [\"Xelem\", \"ygen\", \"ycand\"]:\n", + "for coll in [\"Xelem\"]:\n", " arrs_awk[coll] = {}\n", " arrs_flat[coll] = {}\n", " for feat in [\"typ\", \"pt\", \"eta\", \"phi\", \"e\"]:\n", @@ -139,9 +144,18 @@ " )\n", " arrs_flat[coll][feat] = awkward.from_regular([np.array(p[coll][feat].tolist()) for p in pickle_data])\n", "\n", + "for coll in [\"ygen\", \"ycand\"]:\n", + " arrs_awk[coll] = {}\n", + " arrs_flat[coll] = {}\n", + " for feat in [\"pid\", \"pt\", \"eta\", \"phi\", \"e\"]:\n", + " arrs_awk[coll][feat] = awkward.from_regular(\n", + " [np.array(p[coll][feat][p[coll][\"pid\"] != 0].tolist()) for p in pickle_data]\n", + " )\n", + " arrs_flat[coll][feat] = awkward.from_regular([np.array(p[coll][feat].tolist()) for p in pickle_data])\n", + "\n", "if \"pythia\" in pickle_data[0].keys():\n", " arrs_flat[\"pythia\"] = {}\n", - " for ifeat, feat in enumerate([\"typ\", \"pt\", \"eta\", \"phi\", \"e\"]):\n", + " for ifeat, feat in 
enumerate([\"pid\", \"pt\", \"eta\", \"phi\", \"e\"]):\n", " # arrs_awk[\"pythia\"][feat] = awkward.from_regular(\n", " # [np.array(p[\"pythia\"][:, ifeat][p[coll][:, 0]!=0].tolist()) for p in pickle_data]\n", " # )\n", @@ -152,7 +166,180 @@ " )\n", " arr.append(np.array(p[\"pythia\"][:, ifeat][mask_invis].tolist()))\n", "\n", - " arrs_flat[\"pythia\"][feat] = awkward.from_regular(arr)" + " arrs_flat[\"pythia\"][feat] = awkward.from_regular(arr)\n", + "\n", + "\n", + "genmet_cmssw = np.array([pickle_data[i][\"genmet\"][0, 0] for i in range(len(pickle_data))])\n", + "genjet_cmssw = awkward.from_iter([pickle_data[i][\"genjet\"] for i in range(len(pickle_data))])\n", + "genjet_cmssw = vector.awk(\n", + " awkward.zip(\n", + " { \n", + " \"pt\": genjet_cmssw[:, :, 0],\n", + " \"eta\": genjet_cmssw[:, :, 1],\n", + " \"phi\": genjet_cmssw[:, :, 2],\n", + " \"e\": genjet_cmssw[:, :, 3],\n", + " }\n", + " )\n", + ")\n", + "\n", + "ygen_met = np.sqrt(awkward.sum(\n", + " (arrs_awk[\"ygen\"][\"pt\"] * np.sin(arrs_awk[\"ygen\"][\"phi\"]))**2 + (arrs_awk[\"ygen\"][\"pt\"] * np.cos(arrs_awk[\"ygen\"][\"phi\"]))**2,\n", + " axis=1\n", + "))\n", + "\n", + "ycand_met = np.sqrt(awkward.sum(\n", + " (arrs_awk[\"ycand\"][\"pt\"] * np.sin(arrs_awk[\"ycand\"][\"phi\"]))**2 + (arrs_awk[\"ycand\"][\"pt\"] * np.cos(arrs_awk[\"ycand\"][\"phi\"]))**2,\n", + " axis=1\n", + "))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3808c046-7b2b-48c2-9952-f52961f70fde", + "metadata": {}, + "outputs": [], + "source": [ + "msk_trk = (arrs_flat[\"Xelem\"][\"typ\"]==1) & (arrs_flat[\"Xelem\"][\"pt\"]>10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c25a98f9-e965-4a48-a69e-78ae6493d371", + "metadata": {}, + "outputs": [], + "source": [ + "arrs_flat[\"ygen\"][\"pid\"][msk_trk]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f91ede7c-d9f0-4f27-95c6-fcffdcc73ee6", + "metadata": {}, + "outputs": [], + "source": [ + "arrs_flat[\"ycand\"][\"pid\"][msk_trk]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66276184-9ae1-49ed-803c-929ec7aeb9e2", + "metadata": {}, + "outputs": [], + "source": [ + "jets_coll = {}\n", + "jets_coll[\"cmssw\"] = genjet_cmssw\n", + "\n", + "for coll in [\"ygen\", \"ycand\"]:\n", + " vec = vector.awk(\n", + " awkward.zip(\n", + " { \n", + " \"pt\": arrs_awk[coll][\"pt\"],\n", + " \"eta\": arrs_awk[coll][\"eta\"],\n", + " \"phi\": arrs_awk[coll][\"phi\"],\n", + " \"e\": arrs_awk[coll][\"e\"],\n", + " }\n", + " )\n", + " )\n", + " jetdef = fastjet.JetDefinition(fastjet.antikt_algorithm, 0.4)\n", + " cluster = fastjet.ClusterSequence(vec.to_xyzt(), jetdef)\n", + " jets_coll[coll] = cluster.inclusive_jets(min_pt=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24de1b68-cc15-4d33-8465-db85c79d5e65", + "metadata": {}, + "outputs": [], + "source": [ + "cmssw_to_ygen = jet_utils.match_two_jet_collections(jets_coll, \"cmssw\", \"ygen\", 0.1)\n", + "cmssw_to_ycand = jet_utils.match_two_jet_collections(jets_coll, \"cmssw\", \"ycand\", 0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e89ca76e-7c4f-4db4-b309-5a167e7bf31b", + "metadata": {}, + "outputs": [], + "source": [ + "print(len(awkward.flatten(cmssw_to_ygen[\"cmssw\"])))\n", + "print(len(awkward.flatten(cmssw_to_ycand[\"cmssw\"])))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d80c62f4-0105-4e62-ab2f-2da42d0f8f15", + "metadata": {}, + "outputs": [], + "source": [ + 
"plt.figure(figsize=(10,10))\n", + "b = np.linspace(0,3,100)\n", + "plt.hist(\n", + " awkward.flatten(\n", + " jets_coll[\"ygen\"][cmssw_to_ygen[\"ygen\"]].pt / jets_coll[\"cmssw\"][cmssw_to_ygen[\"cmssw\"]].pt\n", + " ), bins=b, histtype=\"step\", lw=2\n", + ")\n", + "\n", + "plt.hist(\n", + " awkward.flatten(\n", + " jets_coll[\"ycand\"][cmssw_to_ycand[\"ycand\"]].pt / jets_coll[\"cmssw\"][cmssw_to_ycand[\"cmssw\"]].pt\n", + " ), bins=b, histtype=\"step\", lw=2\n", + ")\n", + "plt.yscale(\"log\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee6da234-1e8b-4ac5-8608-1c82c481c232", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 10))\n", + "b = np.logspace(1, 3, 101)\n", + "plt.hist(genmet_cmssw, bins=b, histtype=\"step\", lw=2)\n", + "plt.hist(ygen_met, bins=b, histtype=\"step\", lw=2)\n", + "plt.hist(ycand_met, bins=b, histtype=\"step\", lw=2)\n", + "plt.xscale(\"log\")\n", + "plt.yscale(\"log\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e916f8dd-a0a8-4aa9-a5b2-56fb7ad406c2", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 10))\n", + "b = np.logspace(1, 4, 61)\n", + "plt.hist((ygen_met/genmet_cmssw)[genmet_cmssw<1], bins=b, histtype=\"step\", lw=2)\n", + "plt.hist((ycand_met/genmet_cmssw)[genmet_cmssw<1], bins=b, histtype=\"step\", lw=2)\n", + "plt.xscale(\"log\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "953ad71f-88a3-48d2-90ee-986e2f07a7ab", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 10))\n", + "b = np.logspace(-1,2,101)\n", + "plt.hist((ygen_met/genmet_cmssw)[genmet_cmssw>1], bins=b, histtype=\"step\", lw=2)\n", + "plt.hist((ycand_met/genmet_cmssw)[genmet_cmssw>1], bins=b, histtype=\"step\", lw=2)\n", + "plt.xscale(\"log\")\n", + "plt.show()" ] }, { @@ -189,6 +376,7 @@ " cms_label(ax)\n", " plt.ylim(1, 1e5)\n", " #sample_label(ax, sample)\n", + " plt.show()\n", " plt.savefig(plot_outpath + \"all_pt.pdf\", bbox_inches=\"tight\")" ] }, @@ -214,6 +402,7 @@ " cms_label(ax)\n", " #sample_label(ax, sample)\n", " plt.ylim(1, 1e3)\n", + " plt.show()\n", " plt.savefig(plot_outpath + \"all_sume.pdf\", bbox_inches=\"tight\")" ] }, @@ -332,9 +521,9 @@ " 0\n", "]:\n", " if pid == 0:\n", - " msk = arrs_flat[\"ygen\"][\"typ\"] != pid\n", + " msk = arrs_flat[\"ygen\"][\"pid\"] != pid\n", " else:\n", - " msk = arrs_flat[\"ygen\"][\"typ\"] == pid\n", + " msk = arrs_flat[\"ygen\"][\"pid\"] == pid\n", " data1 = awkward.to_numpy(awkward.flatten(arrs_flat[\"Xelem\"][\"eta\"][msk]))\n", " data2 = awkward.to_numpy(awkward.flatten(arrs_flat[\"ygen\"][\"eta\"][msk]))\n", "\n", @@ -416,28 +605,17 @@ "source": [ "Xelem_typ_f = np.array(awkward.flatten(arrs_flat[\"Xelem\"][\"typ\"]))\n", "\n", - "ygen_typ_f = np.array(awkward.flatten(arrs_flat[\"ygen\"][\"typ\"]))\n", + "ygen_typ_f = np.array(awkward.flatten(arrs_flat[\"ygen\"][\"pid\"]))\n", "ygen_typ_id = np.zeros(len(ygen_typ_f), dtype=np.int32)\n", "for i in range(len(CLASS_LABELS_CMS)):\n", " ygen_typ_id[ygen_typ_f == CLASS_LABELS_CMS[i]] = i\n", "\n", - "ycand_typ_f = np.array(awkward.flatten(arrs_flat[\"ycand\"][\"typ\"]))\n", + "ycand_typ_f = np.array(awkward.flatten(arrs_flat[\"ycand\"][\"pid\"]))\n", "ycand_typ_id = np.zeros(len(ycand_typ_f), dtype=np.int32)\n", "for i in range(len(CLASS_LABELS_CMS)):\n", " ycand_typ_id[ycand_typ_f == CLASS_LABELS_CMS[i]] = i" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"4fd92d6f-bb46-4634-ae0c-2fe9de614a89", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "ygen_typ_f" - ] - }, { "cell_type": "code", "execution_count": null, @@ -455,7 +633,7 @@ "metadata": {}, "outputs": [], "source": [ - "np.unique(Xelem_typ_f[ygen_typ_f == 11], return_counts=True)" + "np.unique(ygen_typ_id[Xelem_typ_f == 9], return_counts=True)" ] }, { @@ -522,6 +700,7 @@ "\n", "plt.xlabel(\"$p_T$ [GeV]\")\n", "plt.ylabel(\"Number of particles\")\n", + "plt.show()\n", "plt.savefig(plot_outpath + \"pf_vs_truth_pt.pdf\", bbox_inches=\"tight\")" ] }, @@ -550,6 +729,7 @@ "\n", "plt.xlabel(\"particle $\\eta$\")\n", "plt.ylabel(\"Number of particles\")\n", + "plt.show()\n", "plt.savefig(plot_outpath + \"pf_vs_truth_eta.pdf\", bbox_inches=\"tight\")" ] }, @@ -564,11 +744,11 @@ "ax = plt.axes()\n", "b = np.logspace(-2, 4, 101)\n", "hs = []\n", - "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ygen\"][\"typ\"])).tolist())\n", + "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ygen\"][\"pid\"])).tolist())\n", "colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n", "labels = []\n", "for pid in pids[::-1]:\n", - " pt_pid = awkward.to_numpy(awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"pt\"][arrs_awk[\"ygen\"][\"typ\"] == pid])))\n", + " pt_pid = awkward.to_numpy(awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"pt\"][arrs_awk[\"ygen\"][\"pid\"] == pid])))\n", " hs.append(np.histogram(pt_pid, bins=b))\n", " labels.append(CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n", "mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n", @@ -576,7 +756,7 @@ "plt.xscale(\"log\")\n", "plt.ylim(0, 1.2 * np.sum([h[0] for h in hs], axis=0).max())\n", "if sample == \"TTbar_14TeV_TuneCUETP8M1_cfi\":\n", - " plt.ylim(0, 1.5e6)\n", + " plt.ylim(0, 1e5)\n", "\n", "plt.ticklabel_format(style=\"sci\", axis=\"y\", scilimits=(0, 0))\n", "ax.yaxis.major.formatter._useMathText = True\n", @@ -588,6 +768,7 @@ "cms_label(ax)\n", "#sample_label(ax, sample, \", MLPF truth\")\n", "plt.xlim(10**-2, 10**4)\n", + "plt.show()\n", "plt.savefig(plot_outpath + \"truth_pt.pdf\", bbox_inches=\"tight\")" ] }, @@ -602,17 +783,17 @@ "ax = plt.axes()\n", "b = np.linspace(-6, 6, 41)\n", "hs = []\n", - "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ygen\"][\"typ\"])).tolist())\n", + "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ygen\"][\"pid\"])).tolist())\n", "colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n", "labels = []\n", "for pid in pids[::-1]:\n", - " pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"eta\"][arrs_awk[\"ygen\"][\"typ\"] == pid]))\n", + " pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"eta\"][arrs_awk[\"ygen\"][\"pid\"] == pid]))\n", " hs.append(np.histogram(pt_pid, bins=b))\n", " labels.append(CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n", "mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n", "plt.ylim(0, 1.5 * np.sum([h[0] for h in hs], axis=0).max())\n", "if sample == \"TTbar_14TeV_TuneCUETP8M1_cfi\":\n", - " plt.ylim(0, 1e6)\n", + " plt.ylim(0, 1e5)\n", "plt.ticklabel_format(style=\"sci\", axis=\"y\", scilimits=(0, 0))\n", "ax.yaxis.major.formatter._useMathText = True\n", "\n", @@ -625,6 +806,7 @@ "cms_label(ax)\n", "#sample_label(ax, sample, \", MLPF truth\")\n", "plt.xlim(-6, 6)\n", + "plt.show()\n", "plt.savefig(plot_outpath + \"truth_eta.pdf\", bbox_inches=\"tight\")" ] }, @@ -639,11 +821,11 @@ "ax = plt.axes()\n", "b = np.logspace(-2, 4, 101)\n", "hs = []\n", - "pids = 
@@ -639,11 +821,11 @@
     "ax = plt.axes()\n",
     "b = np.logspace(-2, 4, 101)\n",
     "hs = []\n",
-    "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ycand\"][\"typ\"])).tolist())\n",
+    "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ycand\"][\"pid\"])).tolist())\n",
     "colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n",
     "labels = []\n",
     "for pid in pids[::-1]:\n",
-    "    pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"pt\"][arrs_awk[\"ycand\"][\"typ\"] == pid]))\n",
+    "    pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"pt\"][arrs_awk[\"ycand\"][\"pid\"] == pid]))\n",
     "    hs.append(np.histogram(pt_pid, bins=b))\n",
     "    labels.append(CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n",
     "mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n",
@@ -651,7 +833,7 @@
     "plt.xscale(\"log\")\n",
     "plt.ylim(0, 1.2 * np.sum([h[0] for h in hs], axis=0).max())\n",
     "if sample == \"TTbar_14TeV_TuneCUETP8M1_cfi\":\n",
-    "    plt.ylim(0, 1.5e6)\n",
+    "    plt.ylim(0, 1e5)\n",
     "plt.ticklabel_format(style=\"sci\", axis=\"y\", scilimits=(0, 0))\n",
     "ax.yaxis.major.formatter._useMathText = True\n",
     "\n",
@@ -662,6 +844,7 @@
     "cms_label(ax)\n",
     "#sample_label(ax, sample, \", PF\")\n",
     "plt.xlim(10**-2, 10**4)\n",
-    "plt.savefig(plot_outpath + \"pf_pt.pdf\", bbox_inches=\"tight\")"
+    "plt.savefig(plot_outpath + \"pf_pt.pdf\", bbox_inches=\"tight\")\n",
+    "plt.show()"
    ]
   },
@@ -676,11 +859,11 @@
     "ax = plt.axes()\n",
     "b = np.linspace(-6, 6, 41)\n",
     "hs = []\n",
-    "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ycand\"][\"typ\"])).tolist())\n",
+    "pids = sorted(np.unique(awkward.flatten(arrs_awk[\"ycand\"][\"pid\"])).tolist())\n",
     "colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n",
     "labels = []\n",
     "for pid in pids[::-1]:\n",
-    "    pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"eta\"][arrs_awk[\"ycand\"][\"typ\"] == pid]))\n",
+    "    pt_pid = awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"eta\"][arrs_awk[\"ycand\"][\"pid\"] == pid]))\n",
     "    hs.append(np.histogram(pt_pid, bins=b))\n",
     "    labels.append(CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n",
     "mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n",
@@ -688,7 +871,7 @@
     "# plt.xscale(\"log\")\n",
     "plt.ylim(0, 1.5 * np.sum([h[0] for h in hs], axis=0).max())\n",
     "if sample == \"TTbar_14TeV_TuneCUETP8M1_cfi\":\n",
-    "    plt.ylim(0, 1e6)\n",
+    "    plt.ylim(0, 1e5)\n",
     "plt.ticklabel_format(style=\"sci\", axis=\"y\", scilimits=(0, 0))\n",
     "ax.yaxis.major.formatter._useMathText = True\n",
     "\n",
@@ -699,9 +882,20 @@
     "cms_label(ax)\n",
     "#sample_label(ax, sample, \", PF\")\n",
     "plt.xlim(-6, 6)\n",
-    "plt.savefig(plot_outpath + \"pf_eta.pdf\", bbox_inches=\"tight\")"
+    "plt.savefig(plot_outpath + \"pf_eta.pdf\", bbox_inches=\"tight\")\n",
+    "plt.show()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8ce54f2f-a611-401d-b015-3baa1d7a1cdd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "arrs_flat[\"pythia\"]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -714,11 +908,11 @@
     "    ax = plt.axes()\n",
     "    b = np.logspace(-2, 5, 101)\n",
     "    hs = []\n",
-    "    pids = sorted(np.unique(awkward.flatten(arrs_flat[\"pythia\"][\"typ\"])).tolist())\n",
+    "    pids = sorted(np.unique(awkward.flatten(arrs_flat[\"pythia\"][\"pid\"])).tolist())\n",
     "    colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n",
     "    labels = []\n",
     "    for pid in pids[::-1]:\n",
-    "        pt_pid = awkward.to_numpy(awkward.flatten(arrs_flat[\"pythia\"][\"pt\"][arrs_flat[\"pythia\"][\"typ\"] == pid]))\n",
+    "        pt_pid = awkward.to_numpy(awkward.flatten(arrs_flat[\"pythia\"][\"pt\"][arrs_flat[\"pythia\"][\"pid\"] == pid]))\n",
     "        hs.append(np.histogram(pt_pid, bins=b))\n",
     "        labels.append(int(pid))\n",
     "    mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n",
@@ -730,6 +924,7 @@
     "    # plt.title(\"{}\\nMLPF truth\".format(sample))\n",
     "    cms_label(ax)\n",
     "    #sample_label(ax, sample, \", Pythia\")\n",
-    "    plt.savefig(plot_outpath + \"pythia_pt.pdf\", bbox_inches=\"tight\")"
+    "    plt.savefig(plot_outpath + \"pythia_pt.pdf\", bbox_inches=\"tight\")\n",
+    "    plt.show()"
    ]
   },
@@ -745,11 +940,11 @@
     "    ax = plt.axes()\n",
     "    b = np.linspace(-6, 6, 101)\n",
     "    hs = []\n",
-    "    pids = sorted(np.unique(awkward.flatten(arrs_flat[\"pythia\"][\"typ\"])).tolist())\n",
+    "    pids = sorted(np.unique(awkward.flatten(arrs_flat[\"pythia\"][\"pid\"])).tolist())\n",
     "    colors = plt.cm.get_cmap(\"tab20c\", len(pids))\n",
     "    labels = []\n",
     "    for pid in pids[::-1]:\n",
-    "        pt_pid = awkward.to_numpy(awkward.flatten(arrs_flat[\"pythia\"][\"eta\"][arrs_flat[\"pythia\"][\"typ\"] == pid]))\n",
+    "        pt_pid = awkward.to_numpy(awkward.flatten(arrs_flat[\"pythia\"][\"eta\"][arrs_flat[\"pythia\"][\"pid\"] == pid]))\n",
     "        hs.append(np.histogram(pt_pid, bins=b))\n",
     "        labels.append(int(pid))\n",
     "    mplhep.histplot(hs, stack=True, histtype=\"fill\", label=labels, color=colors.colors)\n",
@@ -761,6 +956,7 @@
     "    # plt.title(\"{}\\nMLPF truth\".format(sample))\n",
     "    cms_label(ax)\n",
     "    #sample_label(ax, sample, \", Pythia\")\n",
-    "    plt.savefig(plot_outpath + \"pythia_eta.pdf\", bbox_inches=\"tight\")"
+    "    plt.savefig(plot_outpath + \"pythia_eta.pdf\", bbox_inches=\"tight\")\n",
+    "    plt.show()"
    ]
   },
@@ -776,11 +972,11 @@
     "    plt.figure()\n",
     "    ax = plt.axes()\n",
     "    plt.hist(\n",
-    "        awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"pt\"][arrs_awk[\"ycand\"][\"typ\"] == pid])),\n",
+    "        awkward.to_numpy(awkward.flatten(arrs_awk[\"ycand\"][\"pt\"][arrs_awk[\"ycand\"][\"pid\"] == pid])),\n",
     "        bins=b, histtype=\"step\", lw=2, label=\"PF\"\n",
     "    )\n",
     "    plt.hist(\n",
-    "        awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"pt\"][arrs_awk[\"ygen\"][\"typ\"] == pid])),\n",
+    "        awkward.to_numpy(awkward.flatten(arrs_awk[\"ygen\"][\"pt\"][arrs_awk[\"ygen\"][\"pid\"] == pid])),\n",
     "        bins=b,\n",
     "        histtype=\"step\",\n",
     "        lw=2,\n",
@@ -793,6 +989,7 @@
     "    plt.xlabel(\"$p_T$ [GeV]\")\n",
     "    cms_label(ax)\n",
     "    #sample_label(ax, sample)\n",
-    "    plt.savefig(plot_outpath + \"pid{}_pt.pdf\".format(pid), bbox_inches=\"tight\")"
+    "    plt.savefig(plot_outpath + \"pid{}_pt.pdf\".format(pid), bbox_inches=\"tight\")\n",
+    "    plt.show()"
    ]
   },
@@ -808,13 +1005,13 @@
     "    plt.figure()\n",
     "    ax = plt.axes()\n",
     "    plt.hist(\n",
-    "        awkward.flatten(arrs_awk[\"ycand\"][\"eta\"][arrs_awk[\"ycand\"][\"typ\"] == pid]),\n",
-    "        weights=awkward.flatten(arrs_awk[\"ycand\"][\"e\"][arrs_awk[\"ycand\"][\"typ\"] == pid]),\n",
+    "        awkward.flatten(arrs_awk[\"ycand\"][\"eta\"][arrs_awk[\"ycand\"][\"pid\"] == pid]),\n",
+    "        weights=awkward.flatten(arrs_awk[\"ycand\"][\"e\"][arrs_awk[\"ycand\"][\"pid\"] == pid]),\n",
     "        bins=b, histtype=\"step\", lw=2, label=\"PF\"\n",
     "    )\n",
     "    plt.hist(\n",
-    "        awkward.flatten(arrs_awk[\"ygen\"][\"eta\"][arrs_awk[\"ygen\"][\"typ\"] == pid]),\n",
-    "        weights=awkward.flatten(arrs_awk[\"ygen\"][\"e\"][arrs_awk[\"ygen\"][\"typ\"] == pid]),\n",
+    "        awkward.flatten(arrs_awk[\"ygen\"][\"eta\"][arrs_awk[\"ygen\"][\"pid\"] == pid]),\n",
+    "        weights=awkward.flatten(arrs_awk[\"ygen\"][\"e\"][arrs_awk[\"ygen\"][\"pid\"] == pid]),\n",
     "        bins=b,\n",
     "        histtype=\"step\",\n",
     "        lw=2,\n",
@@ -823,21 +1020,12 @@
     "    plt.title(CLASS_NAMES_CMS[CLASS_LABELS_CMS.index(pid)])\n",
     "    plt.legend(ncol=1, loc=(0.68, 0.8))\n",
     "    plt.xlabel(\"particle $\\eta$\")\n",
-    "    cms_label(ax)\n",
-    "    sample_label(ax, sample)\n",
+    "    # cms_label(ax)\n",
+    "    # sample_label(ax, sample)\n",
-    "    plt.savefig(plot_outpath + \"pid{}_eta.pdf\".format(pid), bbox_inches=\"tight\")"
+    "    plt.savefig(plot_outpath + \"pid{}_eta.pdf\".format(pid), bbox_inches=\"tight\")\n",
+    "    plt.show()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ca5207b1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "mask = arrs_flat[\"ygen\"][\"e\"] > 0  # & (np.abs(arrs_flat[\"ycand\"][\"e\"]-arrs_flat[\"ygen\"][\"e\"])<500)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
"execution_count": null, - "id": "ca5207b1", - "metadata": {}, - "outputs": [], - "source": [ - "mask = arrs_flat[\"ygen\"][\"e\"] > 0 # & (np.abs(arrs_flat[\"ycand\"][\"e\"]-arrs_flat[\"ygen\"][\"e\"])<500)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -865,6 +1053,7 @@ " plt.yscale(\"log\")\n", " plt.xlabel(\"Pythia $\\sum E$ [GeV]\")\n", " plt.ylabel(\"MLPF truth $\\sum E$ [GeV]\")\n", + " plt.show()\n", " plt.savefig(plot_outpath + \"pythia_vs_mlpf_sume.pdf\", bbox_inches=\"tight\")" ] }, @@ -895,21 +1084,74 @@ " plt.yscale(\"log\")\n", " plt.xlabel(\"Pythia $\\sum E$ [GeV]\")\n", " plt.ylabel(\"PF $\\sum E$ [GeV]\")\n", + " plt.show()\n", " plt.savefig(plot_outpath + \"pythia_vs_pf_sume.pdf\", bbox_inches=\"tight\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "19b4a45d", + "id": "c4efcc82-9e35-4a50-aaaa-f04b57f3eadc", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "gen_pid = awkward.flatten(arrs_flat[\"ygen\"][\"pid\"][arrs_flat[\"Xelem\"][\"typ\"]==1])\n", + "cand_pid = awkward.flatten(arrs_flat[\"ycand\"][\"pid\"][arrs_flat[\"Xelem\"][\"typ\"]==1])\n", + "track_pt = awkward.flatten(arrs_flat[\"Xelem\"][\"pt\"][arrs_flat[\"Xelem\"][\"typ\"]==1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b950f4a-178d-459e-b43b-7ed2c9b91b96", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure()\n", + "plt.hist(track_pt, bins=np.logspace(-2,3,100));\n", + "plt.xscale(\"log\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f5ee921-04c7-4691-b5ce-ee217b93d567", + "metadata": {}, + "outputs": [], + "source": [ + "bins = np.logspace(-1, 3, 20)\n", + "fracs_gen = []\n", + "fracs_cand = []\n", + "\n", + "for ibin in range(len(bins)-1):\n", + " b0 = bins[ibin]\n", + " b1 = bins[ibin+1]\n", + " msk = (track_pt >= b0) & (track_pt < b1)\n", + " frac_gen = np.sum(gen_pid[msk]!=0) / np.sum(msk)\n", + " frac_cand = np.sum(cand_pid[msk]!=0) / np.sum(msk)\n", + " fracs_gen.append(frac_gen)\n", + " fracs_cand.append(frac_cand)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdec0d58-32b5-4de0-b759-7d842b330bcd", + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure()\n", + "plt.plot(bins[:-1], fracs_gen, marker=\"o\")\n", + "plt.plot(bins[:-1], fracs_cand, marker=\"o\")\n", + "plt.xscale(\"log\")\n", + "plt.show()" + ] }, { "cell_type": "code", "execution_count": null, - "id": "689c787c-a5ab-4ddc-bd82-8e64392dbe25", + "id": "86838aaa-04ae-4e1e-9ba0-ab28164ea947", "metadata": {}, "outputs": [], "source": [] diff --git a/parameters/pytorch/pyg-clic.yaml b/parameters/pytorch/pyg-clic.yaml index f8bb73d1b..dd16021c4 100644 --- a/parameters/pytorch/pyg-clic.yaml +++ b/parameters/pytorch/pyg-clic.yaml @@ -10,7 +10,7 @@ num_epochs: 100 patience: 20 lr: 0.0001 lr_schedule: cosinedecay # constant, cosinedecay, onecycle -conv_type: gnn_lsh # gnn_lsh, attention, mamba, flashattention +conv_type: attention # gnn_lsh, attention, mamba, flashattention ntrain: ntest: nvalid: @@ -35,12 +35,12 @@ model: gnn_lsh: conv_type: gnn_lsh - embedding_dim: 256 - width: 256 - num_convs: 6 + embedding_dim: 512 + width: 512 + num_convs: 8 activation: "elu" # gnn-lsh specific parameters - bin_size: 256 + bin_size: 32 max_num_bins: 200 distance_dim: 128 layernorm: True @@ -50,16 +50,16 @@ model: attention: conv_type: attention - num_convs: 6 - dropout_ff: 0.0 + num_convs: 12 + dropout_ff: 0.1 dropout_conv_id_mha: 0.0 dropout_conv_id_ff: 0.0 dropout_conv_reg_mha: 0.0 
diff --git a/parameters/pytorch/pyg-clic.yaml b/parameters/pytorch/pyg-clic.yaml
index f8bb73d1b..dd16021c4 100644
--- a/parameters/pytorch/pyg-clic.yaml
+++ b/parameters/pytorch/pyg-clic.yaml
@@ -10,7 +10,7 @@ num_epochs: 100
 patience: 20
 lr: 0.0001
 lr_schedule: cosinedecay # constant, cosinedecay, onecycle
-conv_type: gnn_lsh # gnn_lsh, attention, mamba, flashattention
+conv_type: attention # gnn_lsh, attention, mamba, flashattention
 ntrain:
 ntest:
 nvalid:
@@ -35,12 +35,12 @@ model:
   gnn_lsh:
     conv_type: gnn_lsh
-    embedding_dim: 256
-    width: 256
-    num_convs: 6
+    embedding_dim: 512
+    width: 512
+    num_convs: 8
     activation: "elu"
     # gnn-lsh specific parameters
-    bin_size: 256
+    bin_size: 32
     max_num_bins: 200
     distance_dim: 128
     layernorm: True
@@ -50,16 +50,16 @@ model:
   attention:
     conv_type: attention
-    num_convs: 6
-    dropout_ff: 0.0
+    num_convs: 12
+    dropout_ff: 0.1
     dropout_conv_id_mha: 0.0
     dropout_conv_id_ff: 0.0
     dropout_conv_reg_mha: 0.0
     dropout_conv_reg_ff: 0.0
     activation: "relu"
-    head_dim: 16
+    head_dim: 32
     num_heads: 32
-    attention_type: flash
+    attention_type: math
     use_pre_layernorm: True

   mamba:
@@ -106,9 +106,9 @@ train_dataset:
       batch_size: 1
       samples:
         clic_edm_ttbar_pf:
-          version: 2.0.0
+          version: 2.1.0
         clic_edm_qq_pf:
-          version: 2.0.0
+          version: 2.1.0

 valid_dataset:
   clic:
@@ -116,12 +116,12 @@ valid_dataset:
       batch_size: 1
       samples:
         clic_edm_ttbar_pf:
-          version: 2.0.0
+          version: 2.1.0
         clic_edm_qq_pf:
-          version: 2.0.0
+          version: 2.1.0

 test_dataset:
   clic_edm_ttbar_pf:
-    version: 2.0.0
+    version: 2.1.0
   clic_edm_qq_pf:
-    version: 2.0.0
+    version: 2.1.0
diff --git a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml
index 9504ce378..cfacab525 100644
--- a/parameters/pytorch/pyg-cms-ttbar-nopu.yaml
+++ b/parameters/pytorch/pyg-cms-ttbar-nopu.yaml
@@ -108,7 +108,7 @@ train_dataset:
       batch_size: 10
       samples:
         cms_pf_ttbar_nopu:
-          version: 2.0.0
+          version: 2.2.0

 valid_dataset:
   cms:
@@ -116,8 +116,8 @@ valid_dataset:
       batch_size: 10
       samples:
         cms_pf_ttbar_nopu:
-          version: 2.0.0
+          version: 2.2.0

 test_dataset:
   cms_pf_ttbar_nopu:
-    version: 2.0.0
+    version: 2.2.0
diff --git a/parameters/pytorch/pyg-cms.yaml b/parameters/pytorch/pyg-cms.yaml
index 183a5f52a..68e160693 100644
--- a/parameters/pytorch/pyg-cms.yaml
+++ b/parameters/pytorch/pyg-cms.yaml
@@ -8,7 +8,7 @@ gpu_batch_multiplier: 1
 load:
 num_epochs: 100
 patience: 20
-lr: 0.0002
+lr: 0.0001
 lr_schedule: cosinedecay # constant, cosinedecay, onecycle
 conv_type: attention
 ntrain:
@@ -107,22 +107,16 @@ train_dataset:
     physical_pu:
       batch_size: 1
       samples:
-        cms_pf_qcd:
-          version: 2.0.0
         cms_pf_ttbar:
-          version: 2.0.0
+          version: 2.2.0

 valid_dataset:
   cms:
     physical_pu:
       batch_size: 1
       samples:
-        cms_pf_qcd:
-          version: 2.0.0
         cms_pf_ttbar:
-          version: 2.0.0
+          version: 2.2.0

 test_dataset:
   cms_pf_ttbar:
-    version: 2.0.0
-  cms_pf_qcd:
-    version: 2.0.0
+    version: 2.2.0
diff --git a/scripts/generate_tfds.sh b/scripts/generate_tfds.sh
index 2aab3dda6..d8533b658 100755
--- a/scripts/generate_tfds.sh
+++ b/scripts/generate_tfds.sh
@@ -4,7 +4,7 @@ export KERAS_BACKEND=tensorflow
 export PYTHONPATH="mlpf:$PYTHONPATH"

 # T2_EE_Estonia
-export IMG=/home/software/singularity/pytorch.simg:2024-07-08
+export IMG=/home/software/singularity/pytorch.simg:2024-08-18
 export CMD="singularity exec -B /local -B /scratch/persistent $IMG tfds build "

 # Desktop
@@ -14,9 +14,9 @@ export CMD="singularity exec -B /local -B /scratch/persistent $IMG tfds build "
 # export CMD="singularity exec -B /media/joosep/data --env PYTHONPATH=$PYTHONPATH $IMG tfds build "

 # CMS
-# export DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets
-# export MANUAL_DIR=/local/joosep/mlpf/cms/20240702_cptruthdef
-# $CMD mlpf/heptfds/cms_pf/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_ttbar.log &
+export DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets
+export MANUAL_DIR=/local/joosep/mlpf/cms/20240823_simcluster
+$CMD mlpf/heptfds/cms_pf/ttbar --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_ttbar.log &
 # $CMD mlpf/heptfds/cms_pf/qcd --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_qcd.log &
 # $CMD mlpf/heptfds/cms_pf/ztt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_ztt.log &
 # $CMD mlpf/heptfds/cms_pf/qcd_high_pt --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/pu55to75 --overwrite &> logs/tfds_qcd_high_pt.log &
@@ -34,7 +34,7 @@ export CMD="singularity exec -B /local -B
/scratch/persistent $IMG tfds build " # $CMD mlpf/heptfds/cms_pf/ttbar_nopu --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_ttbar_nopu.log & # $CMD mlpf/heptfds/cms_pf/qcd_nopu --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_qcd_nopu.log & # $CMD mlpf/heptfds/cms_pf/vbf_nopu --data_dir $DATA_DIR --manual_dir $MANUAL_DIR/nopu --overwrite &> logs/tfds_vbf_nopu.log & -# wait +wait # CLIC cluster-based # export DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets diff --git a/scripts/local_test_pyg.sh b/scripts/local_test_pyg.sh index 00fd3cbd0..6bbc0042b 100755 --- a/scripts/local_test_pyg.sh +++ b/scripts/local_test_pyg.sh @@ -9,8 +9,8 @@ mkdir -p local_test_data/TTbar_14TeV_TuneCUETP8M1_cfi/root cd local_test_data/TTbar_14TeV_TuneCUETP8M1_cfi/root #Only CMS-internal use is permitted by CMS rules! Do not use these MC simulation files otherwise! -wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/20240702_cptruthdef/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_100000.root -wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/20240702_cptruthdef/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_100001.root +wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/20240823_simcluster/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_100000.root +wget -q --no-check-certificate -nc https://jpata.web.cern.ch/jpata/mlpf/cms/20240823_simcluster/pu55to75/TTbar_14TeV_TuneCUETP8M1_cfi/root/pfntuple_100001.root cd ../../.. diff --git a/scripts/tallinn/a100/pytorch-clic.sh b/scripts/tallinn/a100/pytorch-clic.sh new file mode 100755 index 000000000..3d6e59a4f --- /dev/null +++ b/scripts/tallinn/a100/pytorch-clic.sh @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH --partition gpu +#SBATCH --gres gpu:a100:1 +#SBATCH --mem-per-gpu 50G +#SBATCH -o logs/slurm-%x-%j-%N.out + +IMG=/home/software/singularity/pytorch.simg:2024-08-18 +cd ~/particleflow + +ulimit -n 10000 +singularity exec -B /scratch/persistent --nv \ + --env PYTHONPATH=hep_tfds \ + --env KERAS_BACKEND=torch \ + $IMG python3 mlpf/pyg_pipeline.py --dataset clic --gpus 1 \ + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \ + --train --conv-type attention --num-epochs 100 --gpu-batch-multiplier 256 --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --comet --dtype bfloat16 diff --git a/scripts/tallinn/a100/pytorch-finetune.sh b/scripts/tallinn/a100/pytorch-finetune.sh new file mode 100644 index 000000000..9e39580c8 --- /dev/null +++ b/scripts/tallinn/a100/pytorch-finetune.sh @@ -0,0 +1,17 @@ +#!/bin/bash +#SBATCH --partition gpu +#SBATCH --gres gpu:mig:1 +#SBATCH --mem-per-gpu 60G +#SBATCH -o logs/slurm-%x-%j-%N.out + +IMG=/home/software/singularity/pytorch.simg:2024-07-08 +cd ~/particleflow + +env + +singularity exec -B /scratch/persistent --nv \ + --env PYTHONPATH=hep_tfds \ + --env KERAS_BACKEND=torch \ + $IMG python3.10 mlpf/pyg_pipeline.py --dataset cms --gpus 1 \ + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms-finetune.yaml \ + --train --test --make-plots --conv-type attention --attention-type flash --gpu-batch-multiplier 4 --num-workers 1 --prefetch-factor 50 --dtype bfloat16 --checkpoint-freq 1 --load experiments/pyg-cms_20240804_095032_809397_finetune/checkpoints/checkpoint-13-19.827532.pth --num-epochs 50 diff --git a/scripts/tallinn/a100/pytorch-small-eval-clic.sh 
b/scripts/tallinn/a100/pytorch-small-eval-clic.sh old mode 100644 new mode 100755 index a10df0bda..a8a79ad82 --- a/scripts/tallinn/a100/pytorch-small-eval-clic.sh +++ b/scripts/tallinn/a100/pytorch-small-eval-clic.sh @@ -1,16 +1,16 @@ #!/bin/bash #SBATCH --partition gpu #SBATCH --gres gpu:mig:1 -#SBATCH --mem-per-gpu 60G +#SBATCH --mem-per-gpu 50G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/pytorch.simg:2024-08-02 +IMG=/home/software/singularity/pytorch.simg:2024-08-18 cd ~/particleflow -WEIGHTS=experiments/pyg-clic_20240807_134034_168101/checkpoints/checkpoint-100-9.413720.pth +WEIGHTS=experiments/pyg-clic_20240830_114129_279460/checkpoints/checkpoint-11-8.095037.pth singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env KERAS_BACKEND=torch \ $IMG python3 mlpf/pyg_pipeline.py --dataset clic --gpus 1 \ --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \ - --test --make-plots --gpu-batch-multiplier 200 --load $WEIGHTS --dtype bfloat16 + --test --make-plots --gpu-batch-multiplier 100 --load $WEIGHTS --dtype bfloat16 --ntest 10000 diff --git a/scripts/tallinn/a100/pytorch-small-eval-cms.sh b/scripts/tallinn/a100/pytorch-small-eval-cms.sh old mode 100644 new mode 100755 index b85fa642f..b91a923d3 --- a/scripts/tallinn/a100/pytorch-small-eval-cms.sh +++ b/scripts/tallinn/a100/pytorch-small-eval-cms.sh @@ -4,13 +4,13 @@ #SBATCH --mem-per-gpu 50G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/pytorch.simg:2024-08-02 +IMG=/home/software/singularity/pytorch.simg:2024-08-18 cd ~/particleflow -WEIGHTS=experiments/pyg-cms-ttbar-nopu_20240815_233931_332621/checkpoints/checkpoint-17-17.282402.pth +WEIGHTS=experiments/pyg-cms_20240901_194940_868487/checkpoints/checkpoint-02-18.710537.pth singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env KERAS_BACKEND=torch \ $IMG python mlpf/pyg_pipeline.py --dataset cms --gpus 1 \ - --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms-ttbar-nopu.yaml \ - --test --make-plots --gpu-batch-multiplier 5 --load $WEIGHTS --ntest 10000 --dtype bfloat16 + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ + --test --make-plots --gpu-batch-multiplier 10 --load $WEIGHTS --ntest 5000 --dtype bfloat16 diff --git a/scripts/tallinn/a100/pytorch.sh b/scripts/tallinn/a100/pytorch.sh index db91084de..8a0535da2 100755 --- a/scripts/tallinn/a100/pytorch.sh +++ b/scripts/tallinn/a100/pytorch.sh @@ -1,10 +1,10 @@ #!/bin/bash #SBATCH --partition gpu #SBATCH --gres gpu:a100:1 -#SBATCH --mem-per-gpu 200G +#SBATCH --mem-per-gpu 100G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/pytorch.simg:2024-08-02 +IMG=/home/software/singularity/pytorch.simg:2024-08-18 cd ~/particleflow ulimit -n 10000 @@ -12,5 +12,6 @@ singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env KERAS_BACKEND=torch \ $IMG python3 mlpf/pyg_pipeline.py --dataset cms --gpus 1 \ - --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms-ttbar-nopu.yaml \ - --train --test --make-plots --num-epochs 20 --conv-type attention --gpu-batch-multiplier 10 --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --comet --ntrain 50000 --nvalid 5000 --ntest 5000 + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-cms.yaml \ + --train --test --make-plots --num-epochs 100 --conv-type 
attention \ + --gpu-batch-multiplier 10 --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --comet diff --git a/scripts/tallinn/cmssw-el8.sh b/scripts/tallinn/cmssw-el8.sh index f2d0f1766..c77d6ac75 100755 --- a/scripts/tallinn/cmssw-el8.sh +++ b/scripts/tallinn/cmssw-el8.sh @@ -6,4 +6,4 @@ source /cvmfs/cms.cern.ch/cmsset_default.sh export UNPACKED_IMAGE=/cvmfs/singularity.opensciencegrid.org/cmssw/cms\:rhel8-x86_64/ -cmssw-el8 -B /cms -B /local -B /scratch/persistent -B /scratch/local --command-to-run $@ +cmssw-el8 -B /root -B /cms -B /local -B /scratch/persistent -B /scratch/local --command-to-run $@ diff --git a/scripts/tallinn/rtx/clic.sh b/scripts/tallinn/rtx/clic.sh index 024936422..9bb08dc26 100755 --- a/scripts/tallinn/rtx/clic.sh +++ b/scripts/tallinn/rtx/clic.sh @@ -1,16 +1,16 @@ #!/bin/bash #SBATCH --partition gpu -#SBATCH --gres gpu:rtx:4 +#SBATCH --gres gpu:rtx:8 #SBATCH --mem-per-gpu 40G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/tf-2.14.0.simg +IMG=/home/software/singularity/pytorch.simg:2024-08-02 cd ~/particleflow -#TF training +ulimit -n 10000 singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ - --env TFDS_DATA_DIR=/scratch/persistent/joosep/tensorflow_datasets \ - $IMG python3.10 mlpf/pipeline.py train -c parameters/tensorflow/clic.yaml \ - --plot-freq 1 \ - --batch-multiplier 0.5 + --env KERAS_BACKEND=torch \ + $IMG python3 mlpf/pyg_pipeline.py --dataset clic --gpus 8 \ + --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \ + --train --conv-type attention --attention-type math --num-epochs 200 --gpu-batch-multiplier 32 --num-workers 4 --prefetch-factor 100 --checkpoint-freq 1 --comet diff --git a/scripts/tallinn/rtx/pytorch.sh b/scripts/tallinn/rtx/pytorch.sh index b029fa7c4..0ce0280f5 100755 --- a/scripts/tallinn/rtx/pytorch.sh +++ b/scripts/tallinn/rtx/pytorch.sh @@ -4,11 +4,12 @@ #SBATCH --mem-per-gpu 40G #SBATCH -o logs/slurm-%x-%j-%N.out -IMG=/home/software/singularity/pytorch.simg:2024-07-03 +IMG=/home/software/singularity/pytorch.simg:2024-08-18 +ulimit -n 10000 singularity exec -B /scratch/persistent --nv \ --env PYTHONPATH=hep_tfds \ --env KERAS_BACKEND=torch \ - $IMG python3.10 mlpf/pyg_pipeline.py --dataset clic --gpus 4 \ + $IMG python3 mlpf/pyg_pipeline.py --dataset clic --gpus 4 \ --data-dir /scratch/persistent/joosep/tensorflow_datasets --config parameters/pytorch/pyg-clic.yaml \ - --train --test --make-plots --conv-type attention --gpu-batch-multiplier 10 --num-workers 1 --prefetch-factor 10 --attention-type math --dtype float32 + --train --test --make-plots --conv-type attention --gpu-batch-multiplier 10 --num-workers 2 --prefetch-factor 100 --attention-type math --dtype float32
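A note on the attention settings changed in parameters/pytorch/pyg-clic.yaml above: head_dim: 32 with num_heads: 32 implies 32 * 32 = 1024 features per element, and attention_type: math corresponds to PyTorch's plain (non-flash) scaled-dot-product kernel, consistent with the --attention-type math flag used in the RTX scripts. Below is a minimal standalone sketch of these shapes; it assumes a recent PyTorch that provides torch.nn.attention.sdpa_kernel and is not the repository's model code.

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

head_dim, num_heads = 32, 32      # as in pyg-clic.yaml
batch, n_elem = 2, 256            # dummy event with 256 input elements
q = k = v = torch.randn(batch, num_heads, n_elem, head_dim)

# Select the "math" (non-flash) scaled-dot-product backend.
with sdpa_kernel(SDPBackend.MATH):
    out = F.scaled_dot_product_attention(q, k, v)

# Concatenating the heads gives num_heads * head_dim = 1024 features per element.
out = out.transpose(1, 2).reshape(batch, n_elem, num_heads * head_dim)
print(out.shape)  # torch.Size([2, 256, 1024])
```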