Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce inconsistent fit closure test data #2180

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
"""
This module contains the InconsistentCommonData class which is meant to have all the
methods needed in order to introduce an inconsistency within a Closure Test.
"""

import yaml
import dataclasses
from validphys.coredata import CommonData
import pandas as pd


@dataclasses.dataclass(eq=False)
class InconsistentCommonData(CommonData):
    """CommonData subclass used to introduce an inconsistency within a closure test.

    It inherits all methods of ``coredata.CommonData`` and overrides
    ``systematic_errors`` as a *settable* property, so that a rescaled
    systematics table can be assigned back onto the object.
    """

    setname: str
    ndata: int
    commondataproc: str
    nkin: int
    nsys: int
    commondata_table: pd.DataFrame = dataclasses.field(repr=False)
    systype_table: pd.DataFrame = dataclasses.field(repr=False)
    systematics_table: pd.DataFrame = dataclasses.field(default=None, repr=False)
    # Backing store for the ``systematic_errors`` property. ``None`` means the
    # systematics have not been modified yet, and the parent-class computation
    # is used instead. (Was annotated ``any`` — the builtin function, not a type.)
    _systematic_errors: "pd.DataFrame | None" = dataclasses.field(default=None, init=False)

    @property
    def systematic_errors(self):
        """Overrides the ``systematic_errors`` method of the CommonData class.

        This is done in order to allow ``systematic_errors`` to be a property
        and hence to be able to assign values to it (see the setter below).
        """
        if self._systematic_errors is None:
            return super().systematic_errors()
        return self._systematic_errors

    @systematic_errors.setter
    def systematic_errors(self, value):
        # Define the setter to allow assignment to systematic_errors
        self._systematic_errors = value

    def select_systype_table_indices(self, treatment_names, names_uncertainties):
        """Return the indices of ``systype_table`` corresponding to the
        intersection of the ``treatment_names`` and ``names_uncertainties`` lists.

        Parameters
        ----------
        treatment_names : list
            list of the names of the treatments that should be selected.
            Possible values are: MULT, ADD

        names_uncertainties : list
            list of the names of the uncertainties that should be selected.
            Possible values are: CORR, UNCORR, THEORYCORR, THEORYUNCORR, SPECIAL.
            SPECIAL is used for intra-dataset systematics.

        Returns
        -------
        pd.Index

        Raises
        ------
        ValueError
            if ``names_uncertainties`` contains anything other than the
            allowed names listed above.
        """
        allowed_names = {"CORR", "UNCORR", "THEORYCORR", "THEORYUNCORR", "SPECIAL"}
        if not set(names_uncertainties) <= allowed_names:
            raise ValueError(
                "names_uncertainties should only contain either CORR, UNCORR, THEORYCORR, THEORYUNCORR or SPECIAL"
            )

        treatment_mask = self.systype_table["treatment"].isin(treatment_names)

        if "SPECIAL" in names_uncertainties:
            # avoid circular import error
            from validphys.covmats import INTRA_DATASET_SYS_NAME

            # SPECIAL selects names *outside* the intra-dataset set; the
            # | operator extends the condition so that the explicitly
            # requested (non-SPECIAL) names are also included.
            explicit_names = [name for name in names_uncertainties if name != "SPECIAL"]
            name_mask = ~self.systype_table["name"].isin(INTRA_DATASET_SYS_NAME) | self.systype_table[
                "name"
            ].isin(explicit_names)
        else:
            name_mask = self.systype_table["name"].isin(names_uncertainties)

        return self.systype_table[treatment_mask & name_mask].index

    def rescale_systematics(self, treatment_names, names_uncertainties, sys_rescaling_factor):
        """Return a copy of the systematics table in which the columns selected
        by ``treatment_names``/``names_uncertainties`` are rescaled by
        ``sys_rescaling_factor``.

        Parameters
        ----------
        treatment_names : list
            list of the names of the treatments that should be rescaled.
            Possible values are: MULT, ADD

        names_uncertainties : list
            list of the names of the uncertainties that should be rescaled.
            Possible values are: CORR, UNCORR, THEORYCORR, THEORYUNCORR, SPECIAL.
            SPECIAL is used for intra-dataset systematics.

        sys_rescaling_factor : float
            factor by which the systematics should be rescaled

        Returns
        -------
        pd.DataFrame
            the rescaled systematics table (the object itself is not modified)
        """
        sys_table = self.systematic_errors.copy()

        # select the columns of the systematics table that should be rescaled
        systype_idx = self.select_systype_table_indices(
            treatment_names=treatment_names, names_uncertainties=names_uncertainties
        )

        # systype indices are 1-based while iloc column positions are 0-based,
        # hence the -1 shift.
        sys_table.iloc[:, systype_idx - 1] *= sys_rescaling_factor

        return sys_table

    def process_commondata(
        self, treatment_names, names_uncertainties, sys_rescaling_factor, inconsistent_datasets
    ):
        """Return a commondata instance with modified systematics.

        Note that if ``self.setname`` is not within ``inconsistent_datasets``,
        or if both ADD and MULT are excluded from ``treatment_names``, the
        commondata object will not be modified.

        NOTE(review): this mutates ``self`` in place and returns it rather
        than returning a copy — confirm that callers do not rely on the
        original object being preserved.

        Parameters
        ----------
        treatment_names : list
            list of the names of the treatments that should be rescaled.
            Possible values are: MULT, ADD

        names_uncertainties : list
            list of the names of the uncertainties that should be rescaled.
            Possible values are: CORR, UNCORR, THEORYCORR, THEORYUNCORR, SPECIAL.
            SPECIAL is used for intra-dataset systematics.

        sys_rescaling_factor : float, int
            factor by which the systematics should be rescaled

        inconsistent_datasets : list
            list of the datasets for which an inconsistency should be introduced

        Returns
        -------
        validphys.inconsistent_ct.InconsistentCommonData
        """
        if self.setname not in inconsistent_datasets:
            return self

        # needs setter to allow assignment to systematic_errors
        self.systematic_errors = self.rescale_systematics(
            treatment_names, names_uncertainties, sys_rescaling_factor
        )

        return self

    def export_uncertainties(self, buffer):
        """Write the uncertainties to ``buffer`` as YAML.

        Same as the ``export_uncertainties`` method of the CommonData class;
        the only difference is that ``systematic_errors`` is now a property
        of the class and not a method.
        """
        definitions = {}
        for idx, row in self.systype_table.iterrows():
            if row["name"] != "SKIP":
                definitions[f"sys_{idx}"] = {"treatment": row["treatment"], "type": row["name"]}

        # Order the definitions by treatment as ADD, MULT
        # TODO: make it so that it corresponds to the original order exactly
        sorted_definitions = {
            k: v for k, v in sorted(definitions.items(), key=lambda item: item[1]["treatment"])
        }
        bins = []

        for idx, row in self.systematic_errors.iterrows():
            tmp = {"stat": float(self.stat_errors[idx])}
            # Hope things come in the right order...
            for key_name, val in zip(sorted_definitions, row):
                tmp[key_name] = float(val)

            bins.append(tmp)

        sorted_definitions["stat"] = {
            "description": "Uncorrelated statistical uncertainties",
            "treatment": "ADD",
            "type": "UNCORR",
        }
        ret = {"definitions": sorted_definitions, "bins": bins}
        yaml.safe_dump(ret, buffer)
35 changes: 34 additions & 1 deletion validphys2/src/validphys/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,33 @@ def parse_dataset_input(self, dataset: Mapping):
variant=variant,
)

def parse_inconsistent_data_settings(self, settings):
    """Parse the ``inconsistent_data_settings`` mapping from the yaml file.

    Unknown keys are reported as warnings; missing keys fall back to their
    defaults (empty lists, and a rescaling factor of 1).
    """
    known_keys = {
        "treatment_names",
        "names_uncertainties",
        "inconsistent_datasets",
        "sys_rescaling_factor",
    }

    # Warn about any key the user supplied that is not recognised.
    for unknown in settings.keys() - known_keys:
        log.warning(
            ConfigError(
                f"Key '{unknown}' in inconsistent_data_settings not known.",
                unknown,
                known_keys,
            )
        )

    defaults = {
        "treatment_names": [],
        "names_uncertainties": [],
        "inconsistent_datasets": [],
        "sys_rescaling_factor": 1,
    }
    return {key: settings.get(key, default) for key, default in defaults.items()}

def parse_use_fitcommondata(self, do_use: bool):
"""Use the commondata files in the fit instead of those in the data
directory."""
Expand Down Expand Up @@ -1716,7 +1743,9 @@ def produce_scale_variation_theories(self, theoryid, point_prescription):
return {"theoryids": NSList(theoryids, nskey="theoryid")}

@configparser.explicit_node
def produce_filter_data(self, fakedata: bool = False, theorycovmatconfig=None):
def produce_filter_data(
self, fakedata: bool = False, theorycovmatconfig=None, inconsistent_fakedata: bool = False
):
"""Set the action used to filter the data to filter either real or
closure data. If the closure data filter is being used and if the
theory covariance matrix is not being closure tested then filter
Expand All @@ -1735,6 +1764,10 @@ def produce_filter_data(self, fakedata: bool = False, theorycovmatconfig=None):
"Generating closure test data which samples from the theory "
"covariance matrix has not been implemented yet."
)
elif inconsistent_fakedata:
log.warning("Using filter for inconsistent closure data")
return validphys.filters.filter_inconsistent_closure_data_by_experiment

return validphys.filters.filter_closure_data_by_experiment

@configparser.explicit_node
Expand Down
1 change: 1 addition & 0 deletions validphys2/src/validphys/coredata.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,7 @@ def export_uncertainties(self, buffer):
k: v for k, v in sorted(definitions.items(), key=lambda item: item[1]["treatment"])
}
bins = []

for idx, row in self.systematic_errors().iterrows():
tmp = {"stat": float(self.stat_errors[idx])}
# Hope things come in the right order...
Expand Down
Loading