Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce inconsistent fit closure test data #2180

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
"""
This module contains the InconsistentCommonData class which is meant to have all the
methods needed in order to introduce an inconsistency within a Closure Test.
"""

import yaml
import dataclasses
from validphys.coredata import CommonData
import pandas as pd


@dataclasses.dataclass(eq=False)
class InconsistentCommonData(CommonData):
    """CommonData subclass used to introduce an inconsistency within a closure test.

    It inherits all methods of ``coredata.CommonData`` and overrides
    ``systematic_errors`` as a *settable* property, so that a rescaled
    systematics table can be assigned back onto the object.
    """

    setname: str
    ndata: int
    commondataproc: str
    nkin: int
    nsys: int
    commondata_table: pd.DataFrame = dataclasses.field(repr=False)
    systype_table: pd.DataFrame = dataclasses.field(repr=False)
    systematics_table: pd.DataFrame = dataclasses.field(default=None, repr=False)
    # Backing store for the ``systematic_errors`` property. ``None`` means the
    # systematics have not been modified yet, and the parent-class computation
    # is used instead. (Was annotated ``any`` — the builtin function, not a type.)
    _systematic_errors: "pd.DataFrame | None" = dataclasses.field(default=None, init=False)

    @property
    def systematic_errors(self):
        """Overrides the ``systematic_errors`` method of the CommonData class.

        This is done in order to allow ``systematic_errors`` to be a property
        and hence to be able to assign values to it (see the setter below).
        """
        if self._systematic_errors is None:
            return super().systematic_errors()
        return self._systematic_errors

    @systematic_errors.setter
    def systematic_errors(self, value):
        # Define the setter to allow assignment to systematic_errors
        self._systematic_errors = value

    def select_systype_table_indices(self, treatment_names, names_uncertainties):
        """Return the indices of ``systype_table`` corresponding to the
        intersection of the ``treatment_names`` and ``names_uncertainties`` lists.

        Parameters
        ----------
        treatment_names : list
            list of the names of the treatments that should be selected.
            Possible values are: MULT, ADD

        names_uncertainties : list
            list of the names of the uncertainties that should be selected.
            Possible values are: CORR, UNCORR, THEORYCORR, THEORYUNCORR, SPECIAL.
            SPECIAL is used for intra-dataset systematics.

        Returns
        -------
        pd.Index

        Raises
        ------
        ValueError
            if ``names_uncertainties`` contains anything other than the
            allowed names listed above.
        """
        allowed_names = {"CORR", "UNCORR", "THEORYCORR", "THEORYUNCORR", "SPECIAL"}
        if not set(names_uncertainties) <= allowed_names:
            raise ValueError(
                "names_uncertainties should only contain either CORR, UNCORR, THEORYCORR, THEORYUNCORR or SPECIAL"
            )

        treatment_mask = self.systype_table["treatment"].isin(treatment_names)

        if "SPECIAL" in names_uncertainties:
            # avoid circular import error
            from validphys.covmats import INTRA_DATASET_SYS_NAME

            # SPECIAL selects names *outside* the intra-dataset set; the
            # | operator extends the condition so that the explicitly
            # requested (non-SPECIAL) names are also included.
            explicit_names = [name for name in names_uncertainties if name != "SPECIAL"]
            name_mask = ~self.systype_table["name"].isin(INTRA_DATASET_SYS_NAME) | self.systype_table[
                "name"
            ].isin(explicit_names)
        else:
            name_mask = self.systype_table["name"].isin(names_uncertainties)

        return self.systype_table[treatment_mask & name_mask].index

    def rescale_systematics(self, treatment_names, names_uncertainties, sys_rescaling_factor):
        """Return a copy of the systematics table in which the columns selected
        by ``treatment_names``/``names_uncertainties`` are rescaled by
        ``sys_rescaling_factor``.

        Parameters
        ----------
        treatment_names : list
            list of the names of the treatments that should be rescaled.
            Possible values are: MULT, ADD

        names_uncertainties : list
            list of the names of the uncertainties that should be rescaled.
            Possible values are: CORR, UNCORR, THEORYCORR, THEORYUNCORR, SPECIAL.
            SPECIAL is used for intra-dataset systematics.

        sys_rescaling_factor : float
            factor by which the systematics should be rescaled

        Returns
        -------
        pd.DataFrame
            the rescaled systematics table (the object itself is not modified)
        """
        sys_table = self.systematic_errors.copy()

        # select the columns of the systematics table that should be rescaled
        systype_idx = self.select_systype_table_indices(
            treatment_names=treatment_names, names_uncertainties=names_uncertainties
        )

        # systype indices are 1-based while iloc column positions are 0-based,
        # hence the -1 shift.
        sys_table.iloc[:, systype_idx - 1] *= sys_rescaling_factor

        return sys_table

    def process_commondata(
        self, treatment_names, names_uncertainties, sys_rescaling_factor, inconsistent_datasets
    ):
        """Return a commondata instance with modified systematics.

        Note that if ``self.setname`` is not within ``inconsistent_datasets``,
        or if both ADD and MULT are excluded from ``treatment_names``, the
        commondata object will not be modified.

        NOTE(review): this mutates ``self`` in place and returns it rather
        than returning a copy — confirm that callers do not rely on the
        original object being preserved.

        Parameters
        ----------
        treatment_names : list
            list of the names of the treatments that should be rescaled.
            Possible values are: MULT, ADD

        names_uncertainties : list
            list of the names of the uncertainties that should be rescaled.
            Possible values are: CORR, UNCORR, THEORYCORR, THEORYUNCORR, SPECIAL.
            SPECIAL is used for intra-dataset systematics.

        sys_rescaling_factor : float, int
            factor by which the systematics should be rescaled

        inconsistent_datasets : list
            list of the datasets for which an inconsistency should be introduced

        Returns
        -------
        validphys.inconsistent_ct.InconsistentCommonData
        """
        if self.setname not in inconsistent_datasets:
            return self

        # needs setter to allow assignment to systematic_errors
        self.systematic_errors = self.rescale_systematics(
            treatment_names, names_uncertainties, sys_rescaling_factor
        )

        return self

    def export_uncertainties(self, buffer):
        """Write the uncertainties to ``buffer`` as YAML.

        Same as the ``export_uncertainties`` method of the CommonData class;
        the only difference is that ``systematic_errors`` is now a property
        of the class and not a method.
        """
        definitions = {}
        for idx, row in self.systype_table.iterrows():
            if row["name"] != "SKIP":
                definitions[f"sys_{idx}"] = {"treatment": row["treatment"], "type": row["name"]}

        # Order the definitions by treatment as ADD, MULT
        # TODO: make it so that it corresponds to the original order exactly
        sorted_definitions = {
            k: v for k, v in sorted(definitions.items(), key=lambda item: item[1]["treatment"])
        }
        bins = []

        for idx, row in self.systematic_errors.iterrows():
            tmp = {"stat": float(self.stat_errors[idx])}
            # Hope things come in the right order...
            for key_name, val in zip(sorted_definitions, row):
                tmp[key_name] = float(val)

            bins.append(tmp)

        sorted_definitions["stat"] = {
            "description": "Uncorrelated statistical uncertainties",
            "treatment": "ADD",
            "type": "UNCORR",
        }
        ret = {"definitions": sorted_definitions, "bins": bins}
        yaml.safe_dump(ret, buffer)
35 changes: 34 additions & 1 deletion validphys2/src/validphys/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,33 @@ def parse_dataset_input(self, dataset: Mapping):
variant=variant,
)

def parse_inconsistent_data_settings(self, settings):
    """Parse the ``inconsistent_data_settings`` mapping from the yaml file.

    Unknown keys are reported as warnings; missing keys fall back to their
    defaults (empty lists, and a rescaling factor of 1).
    """
    known_keys = {
        "treatment_names",
        "names_uncertainties",
        "inconsistent_datasets",
        "sys_rescaling_factor",
    }

    # Warn about any key the user supplied that is not recognised.
    for unknown in settings.keys() - known_keys:
        log.warning(
            ConfigError(
                f"Key '{unknown}' in inconsistent_data_settings not known.",
                unknown,
                known_keys,
            )
        )

    defaults = {
        "treatment_names": [],
        "names_uncertainties": [],
        "inconsistent_datasets": [],
        "sys_rescaling_factor": 1,
    }
    return {key: settings.get(key, default) for key, default in defaults.items()}

def parse_use_fitcommondata(self, do_use: bool):
"""Use the commondata files in the fit instead of those in the data
directory."""
Expand Down Expand Up @@ -1716,7 +1743,9 @@ def produce_scale_variation_theories(self, theoryid, point_prescription):
return {"theoryids": NSList(theoryids, nskey="theoryid")}

@configparser.explicit_node
def produce_filter_data(self, fakedata: bool = False, theorycovmatconfig=None):
def produce_filter_data(
self, fakedata: bool = False, theorycovmatconfig=None, inconsistent_fakedata: bool = False
):
"""Set the action used to filter the data to filter either real or
closure data. If the closure data filter is being used and if the
theory covariance matrix is not being closure tested then filter
Expand All @@ -1735,6 +1764,10 @@ def produce_filter_data(self, fakedata: bool = False, theorycovmatconfig=None):
"Generating closure test data which samples from the theory "
"covariance matrix has not been implemented yet."
)
elif inconsistent_fakedata:
log.warning("Using filter for inconsistent closure data")
return validphys.filters.filter_inconsistent_closure_data_by_experiment

return validphys.filters.filter_closure_data_by_experiment

@configparser.explicit_node
Expand Down
1 change: 1 addition & 0 deletions validphys2/src/validphys/coredata.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,7 @@ def export_uncertainties(self, buffer):
k: v for k, v in sorted(definitions.items(), key=lambda item: item[1]["treatment"])
}
bins = []

for idx, row in self.systematic_errors().iterrows():
tmp = {"stat": float(self.stat_errors[idx])}
# Hope things come in the right order...
Expand Down
Loading