adding ATLAS_PH-8TEV_XSEC
jacoterh committed Dec 10, 2024
1 parent e6b83fd commit ceec648
Showing 9 changed files with 6,642 additions and 0 deletions.
80 changes: 80 additions & 0 deletions nnpdf_data/nnpdf_data/commondata/ATLAS_PH_8TEV/data_XSEC.yaml
@@ -0,0 +1,80 @@
data_central:
- 1.03513740e+06
- 301090.3
- 115103.5
- 5.02673050e+04
- 25408.89
- 1.37034250e+04
- 6354.712
- 2535.047
- 1090.0
- 482.427
- 2.34292500e+02
- 98.10972
- 34.16238
- 1.40569950e+01
- 6.54196000e+00
- 2.842982
- 1.13152550e+00
- 4.04777250e-01
- 1.38166000e-01
- 4.35520400e-02
- 9.32802750e-03
- 6.11366600e-04
- 1.34977920e+06
- 3.90347400e+05
- 144561.6
- 6.62355050e+04
- 32981.85
- 17705.31
- 8203.69
- 3.23659800e+03
- 1.39020850e+03
- 616.616
- 3.00503350e+02
- 1.25231250e+02
- 43.1
- 16.54522
- 7.540344
- 3.071376
- 1.160232
- 3.81331500e-01
- 1.23733400e-01
- 2.95970400e-02
- 7.22349300e-03
- 444320.5
- 133886.1
- 48262.66
- 2.15118250e+04
- 1.07101650e+04
- 5.78355950e+03
- 2.69390050e+03
- 1015.716
- 438.5913
- 188.5275
- 8.93689600e+01
- 34.6956
- 1.08634850e+01
- 3.73744
- 1.495224
- 5.13887150e-01
- 1.24847350e-01
- 2.62151850e-02
- 9.49130750e+05
- 283815.4
- 1.04010400e+05
- 4.45961600e+04
- 2.15460000e+04
- 11787.02
- 5.37026850e+03
- 2.04784750e+03
- 828.8342
- 331.2862
- 151.3236
- 54.30558
- 14.10841
- 4.164325
- 1.35155250e+00
- 3.84890850e-01
- 7.19043450e-02
- 1.07784000e-02
259 changes: 259 additions & 0 deletions nnpdf_data/nnpdf_data/commondata/ATLAS_PH_8TEV/filter.py
@@ -0,0 +1,259 @@
import pathlib

import numpy as np
import pandas as pd
import yaml

from nnpdf_data.filter_utils.utils import prettify_float
from nnpdf_data.filter_utils.utils import symmetrize_errors as se

yaml.add_representer(float, prettify_float)

MT_VALUE = 172.5
SQRT_S = 8_000.0


def load_yaml(table_id: int, version: int = 1) -> dict:
    """Load the HEP data table in yaml format.

    Parameters
    ----------
    table_id: int
        table ID number
    version: int
        version of the HEPData record, default=1

    Returns
    -------
    dict:
        dictionary containing the table contents
    """
    filename = f"HEPData-ins1457605-v{version}-Table_{table_id}"
    table = pathlib.Path(f"./rawdata/{filename}.yaml")

    return yaml.safe_load(table.read_text())
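
# A minimal usage sketch, assuming the HEPData yaml tables sit in ./rawdata/:
#
#     table = load_yaml(table_id=1, version=1)
#     print(table["independent_variables"][0]["values"][0])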


def get_kinematics(hepdata: dict, bin_index: list = [], indx: int = 0, mid_rap=None) -> list:
    """Extract the kinematic values (ET bins, rapidity, and center-of-mass
    energy) from the HepData table.

    Parameters
    ----------
    hepdata: dict
        dictionary containing all data info
    bin_index: list
        list of non-empty bin indices
    indx: int
        column index from which to read, default=0
    mid_rap: float
        midpoint of the rapidity bin

    Returns
    -------
    kinematics: list
        list of dictionaries containing the kinematic info per bin
    """
    bins = hepdata["independent_variables"][indx]["values"]

    kinematics = []
    for i in bin_index:
        min_et, max_et = bins[i]["low"], bins[i]["high"]

        kin_value = {
            "eta": {"min": None, "mid": mid_rap, "max": None},
            "ET": {"min": None, "mid": ((min_et + max_et) / 2), "max": None},
            "sqrts": {"min": None, "mid": SQRT_S, "max": None},
        }
        kinematics.append(kin_value)

    return kinematics


def get_data_values(hepdata: dict, bin_index: list, indx: int = 0) -> np.ndarray:
    """Extract the central values from the HepData yaml file.

    Parameters
    ----------
    hepdata: dict
        dictionary containing all data info
    bin_index: list
        bin indices that must be parsed
    indx: int
        column index from which to read the central value, default=0

    Returns
    -------
    np.ndarray:
        array of central values
    """
    central = hepdata["dependent_variables"][indx]["values"]
    return np.array([central[i]["value"] for i in bin_index])


def get_errors(hepdata: dict, bin_index: list) -> tuple:
    """Extract the uncertainties from hepdata and compute the shift of the
    central value in the case of asymmetric uncertainties.

    Parameters
    ----------
    hepdata: dict
        HepData yaml file loaded as a dictionary
    bin_index: list
        bin indices that must be parsed

    Returns
    -------
    tuple:
        the shifted central values (np.ndarray) and the errors
        (pandas DataFrame, one row per bin and one column per source)
    """
    # parse the systematics
    central_values = []  # relevant for asymmetric uncertainties
    df_errors = pd.DataFrame()
    for i, bin_data in enumerate(hepdata["dependent_variables"][0]["values"]):

        error_sources = []
        shift_cv = 0
        error_names = []
        for source in bin_data["errors"]:
            error_names.append(source["label"])
            if source["label"] == "stat":
                error_sources.append(source["symerror"])
            elif "asymerror" in source:
                delta_min = float(source["asymerror"]["minus"])
                delta_plus = float(source["asymerror"]["plus"])
                se_delta, se_sigma = se(delta_plus, delta_min)
                error_sources.append(se_sigma)
                shift_cv += se_delta
            elif "symerror" in source:
                se_sigma = float(source["symerror"])
                error_sources.append(se_sigma)
        df_bin = pd.DataFrame([error_sources], columns=error_names, index=[f"bin {i}"])
        df_errors = pd.concat([df_errors, df_bin])
        cv_i = bin_data["value"] + shift_cv
        central_values.append(cv_i)

    # convert from pb to fb
    df_errors = df_errors * 1e3
    central_values = np.array(central_values) * 1e3

    return central_values, df_errors
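
# A minimal sketch of the symmetrization step above (the numbers are
# hypothetical): for a source with asymerror plus=+5.0 and minus=-3.0,
#
#     se_delta, se_sigma = se(5.0, -3.0)
#     cv_shifted = cv + se_delta  # the central value absorbs the shift
#
# se_sigma is the single symmetric uncertainty that replaces the
# asymmetric +5.0/-3.0 pair in the error table.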


def format_uncertainties(uncs: dict) -> list:
    """Format the uncertainties to be dumped into the yaml file.

    Parameters
    ----------
    uncs: dict
        dictionary containing the various sources of uncertainties

    Returns
    -------
    list:
        list of dictionaries whose elements are the various errors
    """

    combined_errors = []
    n_bins = uncs["systematics"].index.str.startswith("bin").sum()
    for i in range(n_bins):
        errors = {}
        if "statistics" in uncs:
            errors["stat"] = uncs["statistics"].loc[f"bin {i}"].values.item()
        for j, unc in enumerate(uncs["systematics"].loc[f"bin {i}"].values):
            errors[f"sys_corr_{j + 1}"] = float(unc)

        combined_errors.append(errors)

    return combined_errors
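
# The returned structure is one dictionary per bin (a sketch with
# hypothetical numbers), matching the `bins` layout of the uncertainties
# yaml file:
#
#     [{"stat": 12.3, "sys_corr_1": 4.5, ..., "sys_corr_n": 0.7}, ...]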


def dump_commondata(kinematics: list, data: list, errors: dict, obs: str) -> None:
    """Generate and write the commondata files.

    Parameters
    ----------
    kinematics: list
        list containing the kinematic values
    data: list
        list containing the central values
    errors: dict
        dictionary containing the different errors
    obs: str
        name to append to the file names
    """

    if "statistics" in errors:
        error_definition = {
            "stat": {
                "description": "Uncorrelated statistical uncertainties",
                "treatment": errors["statistics"].loc["treatment"].iloc[0],
                "type": errors["statistics"].loc["type"].iloc[0],
            }
        }
    else:
        error_definition = {}

    n_sys = errors["systematics"].shape[1]
    for i in range(n_sys):

        error_definition[f"sys_corr_{i + 1}"] = {
            "description": errors["systematics"].columns[i],
            "treatment": errors["systematics"].loc["treatment"].iloc[i],
            "type": errors["systematics"].loc["type"].iloc[i],
        }

    errors_formatted = format_uncertainties(errors)
    with open(f"data_{obs}.yaml", "w") as file:
        yaml.dump({"data_central": data.tolist()}, file, sort_keys=False)

    with open(f"kinematics_{obs}.yaml", "w") as file:
        yaml.dump({"bins": kinematics}, file, sort_keys=False)

    with open(f"uncertainties_{obs}.yaml", "w") as file:
        yaml.dump(
            {"definitions": error_definition, "bins": errors_formatted}, file, sort_keys=False
        )
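
# A sketch of the resulting uncertainties_XSEC.yaml layout (descriptions
# depend on the HEPData labels):
#
#     definitions:
#       stat: {description: ..., treatment: ADD, type: UNCORR}
#       sys_corr_1: {description: ..., treatment: MULT, type: UNCORR}
#       ...
#     bins:
#     - stat: ...
#       sys_corr_1: ...
#       ...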


def main_filter() -> None:
    """
    Main function that reads the HepData yaml files and generates the commondata files.
    """

    yaml_content_data = [load_yaml(table_id=i, version=1) for i in range(1, 5)]
    uncertainties_all = pd.DataFrame()
    central_values_all = np.array([])
    kinematics_all = []
    n_datapoints = [22, 21, 18, 18]
    mid_rapidities = [0.3, 0.985, 1.685, 2.09]
    for i, yaml_content in enumerate(yaml_content_data):
        kinematics = get_kinematics(
            yaml_content, bin_index=range(n_datapoints[i]), mid_rap=mid_rapidities[i]
        )
        central_values, uncertainties = get_errors(yaml_content, bin_index=range(n_datapoints[i]))
        uncertainties_all = pd.concat([uncertainties_all, uncertainties])
        central_values_all = np.concatenate([central_values_all, central_values])
        kinematics_all += kinematics

    uncertainties_all.index = [f"bin {i}" for i in range(uncertainties_all.shape[0])]

    n_sources = uncertainties_all.shape[1]
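    # The first error column (stat) is additive and uncorrelated; all
    # systematic sources are multiplicative. Every source is of type UNCORR
    # except the last one, the luminosity uncertainty, whose ATLASLUMI15
    # label correlates it with other ATLAS datasets carrying the same type.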
    sys_types = {
        "treatment": ["ADD"] + ["MULT"] * (n_sources - 1),
        "type": ["UNCORR"] * (n_sources - 1) + ["ATLASLUMI15"],
    }
    sys_types_df = pd.DataFrame(sys_types, index=uncertainties_all.columns).T
    df_errors = pd.concat([sys_types_df, uncertainties_all])

    errors = {"statistics": df_errors.iloc[:, [0]], "systematics": df_errors.iloc[:, 1:]}

    dump_commondata(kinematics_all, central_values_all, errors, obs="XSEC")


if __name__ == "__main__":
    main_filter()
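
# To regenerate the commondata files, run this script from the
# ATLAS_PH_8TEV commondata directory: it reads the HEPData tables from
# ./rawdata/ and writes data_XSEC.yaml, kinematics_XSEC.yaml and
# uncertainties_XSEC.yaml into the working directory:
#
#     python filter.py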