Commit

Gen/Sim training dataset (#100)
new gen/sim dataset based on CaloParticles+TrackingParticles, use PCGrad for gen training
jpata authored Apr 28, 2022
1 parent d2bc8ba commit 1d35056
Showing 37 changed files with 2,824 additions and 1,101 deletions.
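
The commit message mentions PCGrad for the gen-level training. As a rough illustration only (not the implementation added by this commit), PCGrad resolves conflicting multi-task gradients by projecting each task gradient onto the normal plane of any other task gradient it conflicts with, then summing the results:

```python
# Sketch of the PCGrad projection step (Yu et al., "Gradient Surgery for
# Multi-Task Learning", 2020); not the code added in this commit.
import random
import numpy as np

def pcgrad_combine(task_grads, seed=0):
    """Combine per-task gradients (flat numpy vectors) with PCGrad:
    when two task gradients conflict (negative dot product), project one
    onto the normal plane of the other before summing."""
    rng = random.Random(seed)
    combined = np.zeros_like(task_grads[0], dtype=float)
    for i, g in enumerate(task_grads):
        g = g.astype(float).copy()
        others = [h for j, h in enumerate(task_grads) if j != i]
        rng.shuffle(others)  # the original algorithm iterates the other tasks in random order
        for h in others:
            dot = float(np.dot(g, h))
            if dot < 0.0:  # conflicting gradient: remove the component along h
                g -= dot / (float(np.dot(h, h)) + 1e-12) * h
        combined += g
    return combined

# Toy example with two conflicting task gradients:
g_classification = np.array([1.0, 0.0])
g_regression = np.array([-0.5, 1.0])
print(pcgrad_combine([g_classification, g_regression]))
```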
27 changes: 4 additions & 23 deletions .github/workflows/test.yml
@@ -19,8 +19,8 @@ jobs:
sudo python3 -m pip install --upgrade pip
sudo python3 -m pip install --upgrade setuptools
sudo python3 -m pip install tensorflow==2.6 setGPU \
sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods \
awkward0 keras-tuner networkx \
sklearn matplotlib mplhep pandas scipy uproot \
awkward vector keras-tuner networkx \
tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 \
tqdm click tensorflow-datasets 'ray[default]'==1.6.0 'ray[tune]==1.6.0' \
tf-models-official tensorflow-text \
Expand All @@ -41,8 +41,8 @@ jobs:
sudo python3 -m pip install --upgrade pip
sudo python3 -m pip install --upgrade setuptools
sudo python3 -m pip install tensorflow==2.6 setGPU \
sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods \
awkward0 keras-tuner networkx \
sklearn matplotlib mplhep pandas scipy uproot \
awkward vector keras-tuner networkx \
tensorflow-probability==0.12.2 tensorflow-addons==0.13.0 \
tqdm click tensorflow-datasets 'ray[default]'==1.6.0 'ray[tune]'==1.6.0 \
tf-models-official tensorflow-text \
Expand All @@ -52,22 +52,3 @@ jobs:
git submodule update
- name: Run CMS TF model using the pipeline
run: ./scripts/local_test_cms_pipeline.sh

delphes-pytorch:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Install python deps
run: |
sudo apt install python3 python3-pip wget
sudo python3 -m pip install --upgrade pip
sudo python3 -m pip install --upgrade setuptools
sudo python3 -m pip install setGPU sklearn matplotlib mplhep pandas scipy uproot3 uproot3-methods awkward0 networkx
sudo python3 -m pip install torch==1.6.0 torchvision==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
sudo python3 -m pip install --no-index torch-scatter -f https://pytorch-geometric.com/whl/torch-1.6.0+cpu.html
sudo python3 -m pip install --no-index torch-sparse -f https://pytorch-geometric.com/whl/torch-1.6.0+cpu.html
sudo python3 -m pip install --no-index torch-cluster -f https://pytorch-geometric.com/whl/torch-1.6.0+cpu.html
sudo python3 -m pip install --no-index torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.6.0+cpu.html
sudo python3 -m pip install torch-geometric comet_ml
- name: Run Delphes Pytorch model
run: ./scripts/local_test_delphes_pytorch.sh
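
The dependency change above replaces uproot3/uproot3-methods/awkward0 with uproot, awkward, and vector. A minimal read pattern under the new stack, with a hypothetical file name and tree path (not taken from this repository):

```python
import uproot
import vector

vector.register_awkward()  # lets awkward records behave as vectors (the new "vector" dependency)

# Placeholders: the actual ntuple file name and tree path depend on the producer.
with uproot.open("pfntuple_1.root") as rootfile:
    tree = rootfile["ana/pftree"]          # assumed tree path, adjust to your file
    arrays = tree.arrays(library="ak")     # jagged branches come back as awkward Arrays
    print(len(arrays), arrays.fields[:5])
```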
39 changes: 7 additions & 32 deletions README_cms.md
@@ -4,45 +4,20 @@
#get the code
git clone https://github.com/jpata/particleflow.git
cd particleflow
git checkout v1.4
git submodule init
git submodule update
#Download the training datasets, about 60GB
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/tensorflow_datasets ~/
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms data/
#Run the training; multi-GPU support on the same machine is available, explicitly specify the GPUs you want to use
CUDA_VISIBLE_DEVICES=... python3 mlpf/pipeline.py train -c parameters/cms.yaml
```
# Baseline CMS MLPF model

The current model (exported .onnx, SavedModel as .pb, training history, evaluation output) is available at
```
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/models/cms/cms_20210917_142344_403761.gpu0.local.tar.xz ./
```
#Convert the training dataset to TFDS; this requires about 370GB of free space in ~/tensorflow_datasets
tfds build hep_tfds/heptfds/cms_pf/ttbar --data_dir ~/tensorflow_datasets --manual_dir ./data/cms --overwrite
tfds build hep_tfds/heptfds/cms_pf/qcd --data_dir ~/tensorflow_datasets --manual_dir ./data/cms --overwrite
tfds build hep_tfds/heptfds/cms_pf/ztt --data_dir ~/tensorflow_datasets --manual_dir ./data/cms --overwrite
# Dataset creation

The following example generates a small training sample using CMSSW
```bash
cd mlpf/data
./run_gen.sh
```
Note that `pu_files.txt` and a corresponding CMSSW release must exist locally. Batch submission of the generator jobs depends on the local batch system and is left as an exercise to the reader.

Generate TFRecord datasets from the pickle files
```bash
mkdir -p data
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/TTbar* data/
rsync -r --progress lxplus.cern.ch:/eos/user/j/jpata/mlpf/cms/Single* data/
tfds build ./hep_tfds/heptfds/cms_pf/ttbar --manual_dir data
tfds build ./hep_tfds/heptfds/cms_pf/singlepi --manual_dir data
tfds build ./hep_tfds/heptfds/cms_pf/singlepi0 --manual_dir data
tfds build ./hep_tfds/heptfds/cms_pf/singleele --manual_dir data
tfds build ./hep_tfds/heptfds/cms_pf/singlemu --manual_dir data
tfds build ./hep_tfds/heptfds/cms_pf/singlegamma --manual_dir data
tfds build ./hep_tfds/heptfds/cms_pf/singletau --manual_dir data
#Run the training; multi-GPU support on the same machine is available, explicitly specify the GPUs you want to use
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 mlpf/pipeline.py train -c parameters/cms-gen.yaml
```
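
Once the `tfds build` steps above have finished, the datasets can be loaded back for a quick sanity check. A small sketch, assuming the registered dataset name follows the builder directory (the exact name, version, and split names may differ in your `~/tensorflow_datasets`):

```python
import tensorflow_datasets as tfds

# "cms_pf_ttbar" is assumed from the builder path heptfds/cms_pf/ttbar;
# check tfds.list_builders() or your data_dir for the real registered name.
ds, info = tfds.load("cms_pf_ttbar", split="train",
                     data_dir="~/tensorflow_datasets", with_info=True)
print(info.features)
for example in ds.take(1):
    print({k: v.shape for k, v in example.items()})
```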

## Older presentations in CMS
4 changes: 4 additions & 0 deletions clic/process_data.sh
@@ -0,0 +1,4 @@
#!/bin/bash
source /opt/hepsim.sh
source /opt/jas4pp.sh
fpad clic/dumper.py $1
2 changes: 1 addition & 1 deletion hep_tfds
9 changes: 5 additions & 4 deletions mlpf/data/genjob.sh
@@ -2,7 +2,7 @@
set -e
set -x

CMSSWDIR=/home/joosep/reco/mlpf/CMSSW_12_1_0_pre3
CMSSWDIR=/home/joosep/reco/mlpf/CMSSW_12_3_0_pre6
MLPF_PATH=/home/joosep/particleflow/

#seed must be greater than 0
Expand All @@ -14,7 +14,7 @@ mkdir -p $WORKDIR

PILEUP=NoPileUp

N=200
N=1000

env
source /cvmfs/cms.cern.ch/cmsset_default.sh
@@ -63,5 +63,6 @@ cmsRun step2_phase1_new.py
cmsRun step3_phase1_new.py
cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py
mv pfntuple.root pfntuple_${SEED}.root
python3 ${MLPF_PATH}/mlpf/data/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table --events-per-file -1
rm step*.root
python3 ${MLPF_PATH}/mlpf/data/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table
bzip2 -z pfntuple_${SEED}.pkl
#rm step*.root
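
The updated last steps write one bzip2-compressed pickle per job instead of splitting into per-event files. A generic sketch for reading such a file back (the seed and the layout of the pickled object are placeholders; the real structure is whatever postprocessing2.py writes):

```python
import bz2
import pickle

# Hypothetical output of a job run with SEED=1.
with bz2.open("pfntuple_1.pkl.bz2", "rb") as fi:
    data = pickle.load(fi)

print(type(data))
if isinstance(data, (list, tuple)) and data:
    print(type(data[0]))  # inspect the per-event structure
```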
11 changes: 6 additions & 5 deletions mlpf/data/genjob_pu.sh
@@ -2,7 +2,7 @@
set -e
set -x

CMSSWDIR=/home/joosep/reco/mlpf/CMSSW_12_1_0_pre3
CMSSWDIR=/home/joosep/reco/mlpf/CMSSW_12_3_0_pre6
MLPF_PATH=/home/joosep/particleflow/

#seed must be greater than 0
Expand All @@ -13,9 +13,9 @@ WORKDIR=`pwd`/$SAMPLE/$SEED
mkdir -p $WORKDIR

PILEUP=Run3_Flat55To75_PoissonOOTPU
PILEUP_INPUT=filelist:${MLPF_PATH}/mlpf/data/pu_files.txt
PILEUP_INPUT=filelist:${MLPF_PATH}/mlpf/data/pu_files_local.txt

N=10
N=100

env
source /cvmfs/cms.cern.ch/cmsset_default.sh
@@ -65,5 +65,6 @@ cmsRun step2_phase1_new.py
cmsRun step3_phase1_new.py
cmsRun $CMSSWDIR/src/Validation/RecoParticleFlow/test/pfanalysis_ntuple.py
mv pfntuple.root pfntuple_${SEED}.root
python3 ${MLPF_PATH}/mlpf/data/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table --events-per-file -1
rm step*.root
python3 ${MLPF_PATH}/mlpf/data/postprocessing2.py --input pfntuple_${SEED}.root --outpath ./ --save-normalized-table
bzip2 -z pfntuple_${SEED}.pkl
#rm step*.root
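
The pileup job now reads its input from `pu_files_local.txt`. A sketch of how such a local filelist might be produced, assuming the pileup ROOT files were copied to a local directory (the directory and the "file:" prefix are placeholders; conventions depend on the site):

```python
from pathlib import Path

# Build the "filelist:" pileup input as one ROOT file path per line.
pu_dir = Path("/scratch/mlpf/pileup")  # placeholder local directory
paths = sorted("file:" + str(p) for p in pu_dir.glob("*.root"))
Path("mlpf/data/pu_files_local.txt").write_text("\n".join(paths) + "\n")
```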