first version of the nnpdf data package, with versioning

add utility function to read metadata just from dataset name; deprecate a bunch of functions
NNPDF · Dec 6, 2024 · c37ac13 · c37ac13
1 parent ad73d20
commit c37ac13
Show file tree

Hide file tree

Showing 24 changed files with 664 additions and 525 deletions.
diff --git a/deprecated_functions.py b/deprecated_functions.py
@@ -0,0 +1,138 @@
+"""
+Note: this module will be removed after the next tag, don't use anything from here
+"""
+
+import dataclasses
+import logging
+from operator import attrgetter
+
+import pandas as pd
+
+from nnpdf_data.coredata import CommonData
+
+log = logging.getLogger(__name__)
+# Module-level call: the warning below is emitted when this module is imported
+log.warning(
+    "You are loading deprecated functionality that use the old commondata parser. This is no longer supported and will be removed in the near future"
+)
+
+
+### Old commondata:
+### All code below this line is deprecated and will be removed
+def load_commondata_old(commondatafile, systypefile, setname):
+    """Parse a commondata file and a systype file into a CommonData.
+
+    Parameters
+    ----------
+    commondatafile : path to file (it is also opened by name to read its first two lines)
+    systypefile : file or path to file
+    setname : dataset name, stored in the result and quoted in the error message
+
+    Returns
+    -------
+    commondata : CommonData
+        Built with ``legacy=True``; ValueError is raised if table and header disagree.
+    """
+    # First parse the commondata file; its first line is skipped here and read separately below
+    commondatatable = pd.read_csv(commondatafile, sep=r"\s+", skiprows=1, header=None)
+    # NOTE(review): nothing below removes NaNs, despite the former "Remove NaNs" comment here
+    # TODO: replace commondata files with bad formatting
+    # Build header: seven fixed columns, then one ADD/MULT column pair per systematic
+    commondataheader = ["entry", "process", "kin1", "kin2", "kin3", "data", "stat"]
+    nsys = (commondatatable.shape[1] - len(commondataheader)) // 2
+
+    commondataheader += ["ADD", "MULT"] * nsys
+    commondatatable.columns = commondataheader
+    commondatatable.set_index("entry", inplace=True)
+    ndata = len(commondatatable)
+    commondataproc = commondatatable["process"][1]
+    # Check (nsys, ndata) of the table against the values declared in the file's first line
+    cdmetadata = peek_commondata_metadata(commondatafile)
+    if (nsys, ndata) != attrgetter("nsys", "ndata")(cdmetadata):
+        raise ValueError(f"Commondata table information does not match metadata for {setname}")
+
+    # Now parse the systype file
+    systypetable = parse_systypes(systypefile)
+
+    # Populate CommonData object
+    return CommonData(
+        setname=setname,
+        ndata=ndata,
+        commondataproc=commondataproc,
+        nkin=3,
+        nsys=nsys,
+        commondata_table=commondatatable,
+        systype_table=systypetable,
+        legacy=True,
+    )
+
+
+def parse_systypes(systypefile):
+    """Parse a systype file into a DataFrame indexed by ``sys_index`` (first line skipped)."""
+    systypeheader = ["sys_index", "treatment", "name"]
+    try:
+        systypetable = pd.read_csv(
+            systypefile, sep=r"\s+", names=systypeheader, skiprows=1, header=None
+        )
+        systypetable.dropna(axis="columns", inplace=True)
+    # Some datasets e.g. CMSWCHARMRAT have no systematics: fall back to an empty table
+    except pd.errors.EmptyDataError:
+        systypetable = pd.DataFrame(columns=systypeheader)
+
+    systypetable.set_index("sys_index", inplace=True)
+
+    return systypetable
+
+
+@dataclasses.dataclass(frozen=True)
+class CommonDataMetadata:
+    """Frozen record of the header information that peek_commondata_metadata reads from a file"""
+
+    name: str  # first token of the file's first line
+    nsys: int  # second token of the first line, converted with int()
+    ndata: int  # third token of the first line, converted with int()
+    process_type: str  # get_kinlabel_key applied to the second token of the second line
+
+
+def peek_commondata_metadata(commondatafilename):
+    """Read the first two lines of a commondata file and return them as a CommonDataMetadata"""
+    with open(commondatafilename) as f:
+        try:
+            l = f.readline()  # first line: exactly three fields (name, nsys, ndata)
+            name, nsys_str, ndata_str = l.split()
+            l = f.readline()  # second line: its second field is the process type
+            process_type_str = l.split()[1]
+        except Exception:  # any failure is logged with the file name, then re-raised
+            log.error(f"Error processing {commondatafilename}")
+            raise
+
+    return CommonDataMetadata(
+        name, int(nsys_str), int(ndata_str), get_kinlabel_key(process_type_str)
+    )
+
+
+def get_plot_kinlabels(commondata):
+    """Return the LaTeX kinematic labels for a given commondata, or its process type if unknown"""
+    key = commondata.process_type
+    # NOTE(review): KINLABEL_LATEX is neither defined nor imported in the lines shown — confirm
+    # TODO: the keys in KINLABEL_LATEX need to be updated for the new commondata
+    return KINLABEL_LATEX.get(key, key)
+
+
+def get_kinlabel_key(process_label):
+    """
+    Since there is no 1:1 correspondence between latex keys and the old libNNPDF names
+    we match the longest key such that the proc label starts with it (ValueError if none).
+    """
+    l = process_label
+    try:  # NOTE(review): KINLABEL_LATEX is not defined or imported in the lines shown — confirm
+        if process_label == "EWK_RAP_ASY":
+            # TODO this function is disappearing in this PR
+            l = "EWK_RAP"
+        return next(k for k in sorted(KINLABEL_LATEX, key=len, reverse=True) if l.startswith(k))
+    except StopIteration as e:
+        raise ValueError(
+            "Could not find a set of kinematic "
+            "variables matching  the process %s Check the "
+            "labels defined in commondata.cc. " % (l)
+        ) from e
diff --git a/doc/sphinx/source/vp/customplots.rst b/doc/sphinx/source/vp/customplots.rst
@@ -65,7 +65,7 @@ There are two ways to take advantage of resources produced using the
    * Using extra modules: Additional Python modules or files can be passed to
      ``validphys`` using the ``--extra-modules`` (or ``-x``) flag. The
      functions in these modules then act ``validphys`` providers and can take
-     resources from ``validpys`` as input. This approach allows the 
+     resources from ``validphys`` as input. This approach allows the
      immediate use of runcards or the default styles. One limitation is that
      there is currently no way of adding production rules or parsers in this
      way. Prefer this for actions that are too difficult to upstream to
@@ -76,7 +76,7 @@ There are two ways to take advantage of resources produced using the
          from matplotlib.figure import Figure
          from reportengine.figure import figure
 
-         from validphys.commondataparser import load_commondata
+         from nnpdf_data.commondataparser import load_commondata
 
          # A simple plot that probably should be in validphys to begin with.
 
@@ -103,7 +103,7 @@ There are two ways to take advantage of resources produced using the
 
 
 
-Note that both of these come at the cost of risking future breakage 
+Note that both of these come at the cost of risking future breakage
 somewhat  as we don't guarantee any sort of stability on the internal
 interfaces.
 

diff --git a/doc/sphinx/source/vp/pydataobjs.rst b/doc/sphinx/source/vp/pydataobjs.rst
@@ -143,8 +143,8 @@ Loading CommonData
 ------------------
 
 The underlying functions for loading CommonData can be found in
-:py:mod:`validphys.commondataparser`. The data is loaded
-as :py:class:`validphys.coredata.CommonData`, which uses the
+:py:mod:`nnpdf_data.commondataparser`. The data is loaded
+as :py:class:`nnpdf_data.coredata.CommonData`, which uses the
 `dataclasses <https://docs.python.org/3/library/dataclasses.html>`_ module
 which automatically generates some special methods for the class. The
 underlying data is stored as DataFrames, and so can be used
@@ -153,7 +153,7 @@ with the standard pandas machinery::
     import pandas as pd
 
     from validphys.api import API
-    from validphys.commondataparser import load_commondata
+    from nnpdf_data.commondataparser import load_commondata
     # define dataset settings
     ds_input={'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10}
     # first get the CommonDataSpec
@@ -162,11 +162,11 @@ with the standard pandas machinery::
     assert isinstance(lcd.central_values, pd.Series)
     assert isinstance(lcd.systematics_table, pd.DataFrame)
 
-The :py:class:`validphys.coredata.CommonData` class has a method which returns
+The :py:class:`nnpdf_data.coredata.CommonData` class has a method which returns
 a new instance of the class with cuts applied::
 
     from validphys.api import API
-    from validphys.commondataparser import load_commondata
+    from nnpdf_data.commondataparser import load_commondata
     # define dataset and additional settings
     ds_input={'dataset': 'CMSZDIFF12', 'cfac':('QCD', 'NRM'), 'sys':10}
     inp = {
@@ -193,7 +193,7 @@ more convenient than calling the underlying functions::
 Loading Covariance Matrices
 ---------------------------
 
-Functions which take :py:class:`validphys.coredata.CommonData` s and return
+Functions which take :py:class:`nnpdf_data.coredata.CommonData` s and return
 covariance matrices can be found in
 :py:mod:`validphys.covmats`. As with the commondata
 the functions can be called in scripts directly::

diff --git a/nnpdf_data/examples_of_use.py b/nnpdf_data/examples_of_use.py
@@ -0,0 +1,34 @@
+"""
+    This file contains examples of use of ``nnpdf_data`` as a library.
+    This library is currently in pre-alpha form and should not be considered stable.
+
+    The functions and examples in this file will be eventually removed but might become
+    part of the library as an external user-facing interface.
+
+    There is currently no user-facing interface so no stability is expected.
+"""
+
+from nnpdf_data import path_commondata
+from nnpdf_data.commondataparser import parse_new_metadata
+
+
+def parse_dataset(dataset, variant=None):
+    """Given a dataset name, read the metadata of the corresponding observable.
+    A variant can be given.
+
+    The output is an ``ObservableMetaData`` object, with references to all files
+    that form the dataset but none of them is loaded.
+    This can then be used to _load_ the dataset using load_commondata.
+
+    Example
+    -------
+    >>> from nnpdf_data.commondataparser import load_commondata
+    >>> cd_meta = parse_dataset("LHCB_Z0_7TEV_DIELECTRON_Y")
+    >>> cd = load_commondata(cd_meta)
+    >>> print(cd)
+    CommonData(setname='LHCB_Z0_7TEV_DIELECTRON_Y', ndata=9, commondataproc='DY_Z_Y', nkin=3, nsys=11, legacy=False, legacy_names=['LHCBZ940PB'], kin_variables=['y', 'm_Z2', 'sqrts'])
+    """
+    setname, observable = dataset.rsplit("_", 1)  # observable: text after the last underscore
+    metadata_file = path_commondata / setname / "metadata.yaml"
+    metadata = parse_new_metadata(metadata_file, observable, variant=variant)
+    return metadata
diff --git a/nnpdf_data/nnpdf_data/__init__.py b/nnpdf_data/nnpdf_data/__init__.py
@@ -1,76 +1,22 @@
-from functools import lru_cache
 import pathlib
 
-import yaml
+from ._version import __version__
+from .commondataparser import parse_new_metadata
+from .validphys_compatibility import legacy_to_new_map, legacy_to_new_mapping, new_to_legacy_map
 
 path_vpdata = pathlib.Path(__file__).parent
 path_commondata = path_vpdata / "commondata"
-
-# VP should not have access to this file, only to the products
-_path_legacy_mapping = path_commondata / "dataset_names.yml"
 theory_cards = path_vpdata / "theory_cards"
 
-with open(_path_legacy_mapping) as file:
-    _legacy_to_new_mapping_raw = yaml.load(file, yaml.Loader)
-# Convert strings into a dictionary
-legacy_to_new_mapping = {
-    k: ({"dataset": v} if isinstance(v, str) else v) for k, v in _legacy_to_new_mapping_raw.items()
-}
-
-
-@lru_cache
-def legacy_to_new_map(dataset_name, sys=None):
-    """Find the new dataset name and variant corresponding to an old dataset
-    and systematics choice"""
-    if dataset_name not in legacy_to_new_mapping:
-        return dataset_name, None
-
-    new_name = legacy_to_new_mapping[dataset_name]
-    variant = new_name.get("variant")
-    new_name = new_name["dataset"]
-    if sys is not None:
-        if variant is None:
-            raise KeyError(
-                f"I cannot translate the combination of {dataset_name} and sys: {sys}. Please report this."
-            )
-        variant += f"_{sys}"
-
-    return new_name, variant
-
-
-@lru_cache
-def new_to_legacy_map(dataset_name, variant_used):
-    """Loop over the dictionary and find the right dataset.
-
-    Since it is posible to have more than 1 dataset mapped to the same new one,
-    returns a list of everything that matches.
-
-    This function will loop over the entire dictionary of mappings and selects
-    1. All datasets that match exactly what's in the runcard (dataset & variant): exact_matches
-    2. All datasets that match the dataset name: matches
-    If there are any `exact_matches`, it will return only those; otherwise, return all `matches`
-    if there are no `matches` at all, return None
-    """
-
-    matches = []
-    exact_matches = []
-
-    for old_name, new_info in legacy_to_new_mapping.items():
-        new_name = new_info["dataset"]
-        variant = new_info.get("variant")
 
-        if new_name == dataset_name:
-            matches.append(old_name)
-            # if it's a nuclear DIS data promote legacy to be legacy_dw
-            if "_DW_" in old_name and variant_used == "legacy":
-                variant = "legacy_dw"
+def load_dataset_metadata(dataset_name, variant=None):
+    """Given a dataset name (and optionally a variant), return its parsed metadata"""
 
-            if variant_used == variant:
-                exact_matches.append(old_name)
+    # Compatibility with old nnpdf names: only when no variant is given, the name goes
+    # through legacy_to_new_map. These two lines might disappear at any given point
+    if variant is None:
+        dataset_name, variant = legacy_to_new_map(dataset_name)
 
-    # If we found exact matches, return those and stop looking
-    if exact_matches:
-        return exact_matches
-    elif matches:
-        return matches
-    return None
+    setname, observable = dataset_name.rsplit("_", 1)
+    metadata_file = path_commondata / setname / "metadata.yaml"
+    return parse_new_metadata(metadata_file, observable, variant=variant)