From 6d9bdce6389f0e7916cde31e6dafa900f79eb91c Mon Sep 17 00:00:00 2001 From: Alessandro Candido Date: Tue, 19 Oct 2021 18:07:55 +0200 Subject: [PATCH 1/9] Add dump to tar option for output --- benchmarks/runners/sandbox.py | 4 +- src/eko/output.py | 84 ++++++++++++++++++++++++++++------- src/eko/runner.py | 1 + 3 files changed, 72 insertions(+), 17 deletions(-) diff --git a/benchmarks/runners/sandbox.py b/benchmarks/runners/sandbox.py index 0478d87c7..7e3d31310 100644 --- a/benchmarks/runners/sandbox.py +++ b/benchmarks/runners/sandbox.py @@ -58,8 +58,8 @@ def generate_operators(): def doit(self): theory_updates = { - **nnpdf_base_theory, - "PTO": 2, + **ffns3, + "PTO": 0, # "ModEv": "EXA", # "XIR": 0.5, # "fact_to_ren_scale_ratio": 2.0, diff --git a/src/eko/output.py b/src/eko/output.py index 8ded07387..b36d4bc6a 100644 --- a/src/eko/output.py +++ b/src/eko/output.py @@ -3,6 +3,9 @@ This file contains the output management """ import logging +import pathlib +import shutil +import tarfile import warnings import lz4.frame @@ -268,7 +271,7 @@ def to_evol(self, source=True, target=False): if target: self["targetpids"] = br.evol_basis_pids - def get_raw(self, binarize=True): + def get_raw(self, binarize=True, skip_q2_grid=False): """ Serialize result as dict/YAML. 
@@ -295,6 +298,7 @@ def get_raw(self, binarize=True): "q2_ref", ]: out[f] = self[f] + # list() work both for np.array and list out["inputpids"] = list(self["inputpids"]) out["targetpids"] = list(self["targetpids"]) # make raw lists @@ -302,19 +306,22 @@ def get_raw(self, binarize=True): for k in ["interpolation_xgrid", "targetgrid", "inputgrid"]: out[k] = self[k].tolist() # make operators raw - for q2, op in self["Q2grid"].items(): - out["Q2grid"][q2] = dict() - for k, v in op.items(): - if k == "alphas": - out["Q2grid"][q2][k] = float(v) - continue - if binarize: - out["Q2grid"][q2][k] = lz4.frame.compress(v.tobytes()) - else: - out["Q2grid"][q2][k] = v.tolist() + if not skip_q2_grid: + for q2, op in self["Q2grid"].items(): + out["Q2grid"][q2] = dict() + for k, v in op.items(): + if k == "alphas": + out["Q2grid"][q2][k] = float(v) + continue + if binarize: + out["Q2grid"][q2][k] = lz4.frame.compress(v.tobytes()) + else: + out["Q2grid"][q2][k] = v.tolist() + else: + out["Q2grid"] = self["Q2grid"] return out - def dump_yaml(self, stream=None, binarize=True): + def dump_yaml(self, stream=None, binarize=True, skip_q2_grid=False): """ Serialize result as YAML. @@ -332,10 +339,10 @@ def dump_yaml(self, stream=None, binarize=True): Null, if written sucessfully to stream """ # TODO explicitly silence yaml - out = self.get_raw(binarize) + out = self.get_raw(binarize, skip_q2_grid=skip_q2_grid) return yaml.dump(out, stream) - def dump_yaml_to_file(self, filename, binarize=True): + def dump_yaml_to_file(self, filename, binarize=True, skip_q2_grid=False): """ Writes YAML representation to a file. @@ -352,9 +359,52 @@ def dump_yaml_to_file(self, filename, binarize=True): result of dump(output, stream), i.e. 
Null if written sucessfully """ with open(filename, "w") as f: - ret = self.dump_yaml(f, binarize) + ret = self.dump_yaml(f, binarize, skip_q2_grid=skip_q2_grid) return ret + def dump_tar(self, tarname): + """ + Writes representation into a tar archive containing: + + - metadata (in YAML) + - operator (in numpy ``.npy`` format) + + Parameters + ---------- + tarname : str + target file name + """ + tarpath = pathlib.Path(tarname) + if tarpath.suffix != ".tar": + raise ValueError(f"'{tarname}' is not a valid tar filename, wrong suffix") + tmpdir = tarpath.parent / tarpath.stem + + tmpdir.mkdir(parents=True) + + cls = self.__class__ + metadata = cls(**{str(k): v for k, v in self.items() if k != "Q2grid"}) + metadata["Q2grid"] = list(self["Q2grid"].keys()) + + yamlname = tmpdir / "metadata.yaml" + metadata.dump_yaml_to_file(yamlname, skip_q2_grid=True) + + for kind in next(iter(self["Q2grid"].values())).keys(): + arrayname = tmpdir / kind + operator = np.stack([q2[kind] for q2 in self["Q2grid"].values()]) + np.save(arrayname, operator) + + for fp in tmpdir.glob("*.npy"): + with lz4.frame.open(fp.with_suffix(fp.suffix + ".lz4"), "wb") as fo: + with open(fp, "rb") as fi: + fo.write(fi.read()) + + fp.unlink() + + with tarfile.open(tarpath, "w") as tar: + tar.add(tmpdir) + + shutil.rmtree(tmpdir) + @classmethod def load_yaml(cls, stream): """ @@ -410,3 +460,7 @@ def load_yaml_from_file(cls, filename): with open(filename) as o: obj = Output.load_yaml(o) return obj + + @classmethod + def load_from_tar(cls, tarname): + """""" diff --git a/src/eko/runner.py b/src/eko/runner.py index 03f6d910f..89eb2131d 100644 --- a/src/eko/runner.py +++ b/src/eko/runner.py @@ -143,4 +143,5 @@ def get_output(self): targetbasis = np.array(targetbasis) if inputbasis is not None or targetbasis is not None: self.out.flavor_reshape(targetbasis=targetbasis, inputbasis=inputbasis) + __import__("pdb").set_trace() return copy.deepcopy(self.out) From a55c24c1bff7ab75d7cdcf5c8b66208945ca6b5f Mon Sep 17 
00:00:00 2001 From: Alessandro Candido Date: Tue, 19 Oct 2021 18:15:43 +0200 Subject: [PATCH 2/9] Remove pdb left behind --- src/eko/runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/eko/runner.py b/src/eko/runner.py index 89eb2131d..03f6d910f 100644 --- a/src/eko/runner.py +++ b/src/eko/runner.py @@ -143,5 +143,4 @@ def get_output(self): targetbasis = np.array(targetbasis) if inputbasis is not None or targetbasis is not None: self.out.flavor_reshape(targetbasis=targetbasis, inputbasis=inputbasis) - __import__("pdb").set_trace() return copy.deepcopy(self.out) From 812f4096ab5e32bf94c872c92f038c624b695a62 Mon Sep 17 00:00:00 2001 From: Alessandro Candido Date: Thu, 21 Oct 2021 19:53:09 +0200 Subject: [PATCH 3/9] Add output loading from tar --- src/eko/output.py | 65 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/src/eko/output.py b/src/eko/output.py index b36d4bc6a..4ea27f72a 100644 --- a/src/eko/output.py +++ b/src/eko/output.py @@ -2,6 +2,7 @@ """ This file contains the output management """ +import io import logging import pathlib import shutil @@ -406,7 +407,7 @@ def dump_tar(self, tarname): shutil.rmtree(tmpdir) @classmethod - def load_yaml(cls, stream): + def load_yaml(cls, stream, skip_q2_grid=False): """ Load YAML representation from stream @@ -429,20 +430,21 @@ def load_yaml(cls, stream): for k in ["interpolation_xgrid", "inputgrid", "targetgrid"]: obj[k] = np.array(obj[k]) # make operators numpy - for op in obj["Q2grid"].values(): - for k, v in op.items(): - if k == "alphas": - v = float(v) - elif isinstance(v, list): - v = np.array(v) - elif isinstance(v, bytes): - v = np.frombuffer(lz4.frame.decompress(v)) - v = v.reshape(len_tpids, len_tgrid, len_ipids, len_igrid) - op[k] = v + if not skip_q2_grid: + for op in obj["Q2grid"].values(): + for k, v in op.items(): + if k == "alphas": + v = float(v) + elif isinstance(v, list): + v = np.array(v) + elif isinstance(v, bytes): + v = 
np.frombuffer(lz4.frame.decompress(v)) + v = v.reshape(len_tpids, len_tgrid, len_ipids, len_igrid) + op[k] = v return cls(obj) @classmethod - def load_yaml_from_file(cls, filename): + def load_yaml_from_file(cls, filename, skip_q2_grid=False): """ Load YAML representation from file @@ -458,9 +460,42 @@ def load_yaml_from_file(cls, filename): """ obj = None with open(filename) as o: - obj = Output.load_yaml(o) + obj = Output.load_yaml(o, skip_q2_grid) return obj @classmethod - def load_from_tar(cls, tarname): - """""" + def load_tar(cls, tarname): + """ """ + + tarpath = pathlib.Path(tarname) + if tarpath.suffix != ".tar": + raise ValueError(f"'{tarname}' is not a valid tar filename, wrong suffix") + tmpdir = tarpath.parent / tarpath.stem + + with tarfile.open(tarpath, "r") as tar: + tar.extractall(tmpdir.parent) + + # metadata = cls(**{str(k): v for k, v in self.items() if k != "Q2grid"}) + # metadata["Q2grid"] = list(self["Q2grid"].keys()) + + yamlname = tmpdir / "metadata.yaml" + metadata = cls.load_yaml_from_file(yamlname, skip_q2_grid=True) + + grids = {} + for fp in tmpdir.glob("*.npy.lz4"): + with lz4.frame.open(fp, "rb") as fd: + stream = io.BytesIO(fd.read()) + stream.seek(0) + grids[pathlib.Path(fp.stem).stem] = np.load(stream) + + fp.unlink() + + q2grid = metadata["Q2grid"] + operator_grid = {} + for q2, slices in zip(q2grid, zip(*grids.values())): + operator_grid[q2] = dict(zip(grids.keys(), slices)) + metadata["Q2grid"] = operator_grid + + shutil.rmtree(tmpdir) + + return metadata From a3c97eb5ad4e2e1014cf5a09a6130c1709a479e8 Mon Sep 17 00:00:00 2001 From: Alessandro Candido Date: Thu, 21 Oct 2021 20:03:11 +0200 Subject: [PATCH 4/9] Avoid dumping on disk temporary files --- src/eko/output.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/eko/output.py b/src/eko/output.py index 4ea27f72a..cf847c473 100644 --- a/src/eko/output.py +++ b/src/eko/output.py @@ -390,16 +390,12 @@ def dump_tar(self, tarname): 
metadata.dump_yaml_to_file(yamlname, skip_q2_grid=True) for kind in next(iter(self["Q2grid"].values())).keys(): - arrayname = tmpdir / kind operator = np.stack([q2[kind] for q2 in self["Q2grid"].values()]) - np.save(arrayname, operator) - - for fp in tmpdir.glob("*.npy"): - with lz4.frame.open(fp.with_suffix(fp.suffix + ".lz4"), "wb") as fo: - with open(fp, "rb") as fi: - fo.write(fi.read()) - - fp.unlink() + stream = io.BytesIO() + np.save(stream, operator) + stream.seek(0) + with lz4.frame.open((tmpdir / kind).with_suffix(".npy.lz4"), "wb") as fo: + fo.write(stream.read()) with tarfile.open(tarpath, "w") as tar: tar.add(tmpdir) From ca560f1b658a26571655684136b4e9f3704bf007 Mon Sep 17 00:00:00 2001 From: Alessandro Candido Date: Thu, 21 Oct 2021 20:15:37 +0200 Subject: [PATCH 5/9] Use an actual temporary directory --- src/eko/output.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/eko/output.py b/src/eko/output.py index cf847c473..8cb624f4d 100644 --- a/src/eko/output.py +++ b/src/eko/output.py @@ -7,6 +7,7 @@ import pathlib import shutil import tarfile +import tempfile import warnings import lz4.frame @@ -378,29 +379,29 @@ def dump_tar(self, tarname): tarpath = pathlib.Path(tarname) if tarpath.suffix != ".tar": raise ValueError(f"'{tarname}' is not a valid tar filename, wrong suffix") - tmpdir = tarpath.parent / tarpath.stem - - tmpdir.mkdir(parents=True) - cls = self.__class__ - metadata = cls(**{str(k): v for k, v in self.items() if k != "Q2grid"}) - metadata["Q2grid"] = list(self["Q2grid"].keys()) + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = pathlib.Path(tmpdir) - yamlname = tmpdir / "metadata.yaml" - metadata.dump_yaml_to_file(yamlname, skip_q2_grid=True) + cls = self.__class__ + metadata = cls(**{str(k): v for k, v in self.items() if k != "Q2grid"}) + metadata["Q2grid"] = list(self["Q2grid"].keys()) - for kind in next(iter(self["Q2grid"].values())).keys(): - operator = 
np.stack([q2[kind] for q2 in self["Q2grid"].values()]) - stream = io.BytesIO() - np.save(stream, operator) - stream.seek(0) - with lz4.frame.open((tmpdir / kind).with_suffix(".npy.lz4"), "wb") as fo: - fo.write(stream.read()) + yamlname = tmpdir / "metadata.yaml" + metadata.dump_yaml_to_file(yamlname, skip_q2_grid=True) - with tarfile.open(tarpath, "w") as tar: - tar.add(tmpdir) + for kind in next(iter(self["Q2grid"].values())).keys(): + operator = np.stack([q2[kind] for q2 in self["Q2grid"].values()]) + stream = io.BytesIO() + np.save(stream, operator) + stream.seek(0) + with lz4.frame.open( + (tmpdir / kind).with_suffix(".npy.lz4"), "wb" + ) as fo: + fo.write(stream.read()) - shutil.rmtree(tmpdir) + with tarfile.open(tarpath, "w") as tar: + tar.add(tmpdir, arcname=tarpath.stem) @classmethod def load_yaml(cls, stream, skip_q2_grid=False): From 4605086000a7fe78f772b51c9261d6241350cb3e Mon Sep 17 00:00:00 2001 From: Alessandro Candido Date: Thu, 21 Oct 2021 20:20:48 +0200 Subject: [PATCH 6/9] Use temporary directory for reading too --- src/eko/output.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/src/eko/output.py b/src/eko/output.py index 8cb624f4d..99dd9c6d7 100644 --- a/src/eko/output.py +++ b/src/eko/output.py @@ -467,32 +467,32 @@ def load_tar(cls, tarname): tarpath = pathlib.Path(tarname) if tarpath.suffix != ".tar": raise ValueError(f"'{tarname}' is not a valid tar filename, wrong suffix") - tmpdir = tarpath.parent / tarpath.stem - with tarfile.open(tarpath, "r") as tar: - tar.extractall(tmpdir.parent) + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = pathlib.Path(tmpdir) - # metadata = cls(**{str(k): v for k, v in self.items() if k != "Q2grid"}) - # metadata["Q2grid"] = list(self["Q2grid"].keys()) + with tarfile.open(tarpath, "r") as tar: + tar.extractall(tmpdir) - yamlname = tmpdir / "metadata.yaml" - metadata = cls.load_yaml_from_file(yamlname, skip_q2_grid=True) + # metadata = 
cls(**{str(k): v for k, v in self.items() if k != "Q2grid"}) + # metadata["Q2grid"] = list(self["Q2grid"].keys()) - grids = {} - for fp in tmpdir.glob("*.npy.lz4"): - with lz4.frame.open(fp, "rb") as fd: - stream = io.BytesIO(fd.read()) - stream.seek(0) - grids[pathlib.Path(fp.stem).stem] = np.load(stream) + yamlname = tmpdir / tarpath.stem / "metadata.yaml" + metadata = cls.load_yaml_from_file(yamlname, skip_q2_grid=True) - fp.unlink() + grids = {} + for fp in (tmpdir / tarpath.stem).glob("*.npy.lz4"): + with lz4.frame.open(fp, "rb") as fd: + stream = io.BytesIO(fd.read()) + stream.seek(0) + grids[pathlib.Path(fp.stem).stem] = np.load(stream) - q2grid = metadata["Q2grid"] - operator_grid = {} - for q2, slices in zip(q2grid, zip(*grids.values())): - operator_grid[q2] = dict(zip(grids.keys(), slices)) - metadata["Q2grid"] = operator_grid + fp.unlink() - shutil.rmtree(tmpdir) + q2grid = metadata["Q2grid"] + operator_grid = {} + for q2, slices in zip(q2grid, zip(*grids.values())): + operator_grid[q2] = dict(zip(grids.keys(), slices)) + metadata["Q2grid"] = operator_grid return metadata From 7d4867922beabf51d5898e6a5af41e9a8a2dd261 Mon Sep 17 00:00:00 2001 From: Alessandro Candido Date: Fri, 22 Oct 2021 12:00:50 +0200 Subject: [PATCH 7/9] Add eko version to output metadata --- src/eko/output.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/eko/output.py b/src/eko/output.py index 99dd9c6d7..9c28b32f1 100644 --- a/src/eko/output.py +++ b/src/eko/output.py @@ -15,7 +15,7 @@ import yaml from . import basis_rotation as br -from . import interpolation +from . 
import interpolation, version logger = logging.getLogger(__name__) @@ -290,9 +290,7 @@ def get_raw(self, binarize=True, skip_q2_grid=False): dictionary which will be written on output """ # prepare output dict - out = { - "Q2grid": {}, - } + out = {"Q2grid": {}, "eko_version": version.full_version} # dump raw elements for f in [ "interpolation_polynomial_degree", From 663fd0f3bd5c8bf5967bd16a7d70c2be047dd3c8 Mon Sep 17 00:00:00 2001 From: Alessandro Candido Date: Fri, 22 Oct 2021 12:14:35 +0200 Subject: [PATCH 8/9] Address pylint complaints --- src/eko/output.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/eko/output.py b/src/eko/output.py index 9c28b32f1..8080c2bbf 100644 --- a/src/eko/output.py +++ b/src/eko/output.py @@ -5,7 +5,6 @@ import io import logging import pathlib -import shutil import tarfile import tempfile import warnings @@ -331,6 +330,9 @@ def dump_yaml(self, stream=None, binarize=True, skip_q2_grid=False): if given, dump is written on it binarize : bool dump in binary format (instead of list format) + skip_q2_grid : bool + avoid dumping Q2grid (i.e. the actual operators) into the yaml + file (default: ``False``) Returns ------- @@ -352,6 +354,9 @@ def dump_yaml_to_file(self, filename, binarize=True, skip_q2_grid=False): target file name binarize : bool dump in binary format (instead of list format) + skip_q2_grid : bool + avoid dumping Q2grid (i.e. the actual operators) into the yaml + file (default: ``False``) Returns ------- @@ -410,6 +415,9 @@ def load_yaml(cls, stream, skip_q2_grid=False): ---------- stream : any source stream + skip_q2_grid : bool + avoid loading Q2grid (i.e. the actual operators) from the yaml + file (default: ``False``) Returns ------- @@ -447,6 +455,9 @@ def load_yaml_from_file(cls, filename, skip_q2_grid=False): ---------- filename : str source file name + skip_q2_grid : bool + avoid loading Q2grid (i.e.
the actual operators) from the yaml + file (default: ``False``) Returns ------- @@ -460,7 +471,20 @@ def load_yaml_from_file(cls, filename, skip_q2_grid=False): @classmethod def load_tar(cls, tarname): - """ """ + """ + Load tar representation from file (compliant with :meth:`dump_tar` + output). + + Parameters + ---------- + tarname : str + source tar name + + Returns + ------- + obj : output + loaded object + """ tarpath = pathlib.Path(tarname) if tarpath.suffix != ".tar": From d620a68c715eed0331ee538d8b4d49a60ea8fc3b Mon Sep 17 00:00:00 2001 From: Alessandro Candido Date: Fri, 22 Oct 2021 13:21:42 +0200 Subject: [PATCH 9/9] Add loading/dumping tar test --- tests/test_output.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_output.py b/tests/test_output.py index 5a1ccd166..d95017d0e 100644 --- a/tests/test_output.py +++ b/tests/test_output.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- import copy import io +import pathlib +import tempfile from unittest import mock import numpy as np @@ -96,6 +98,20 @@ def test_io(self): np.testing.assert_almost_equal( o3["interpolation_xgrid"], d["interpolation_xgrid"] ) + # repeat for tar + fn = "test.tar" + with tempfile.TemporaryDirectory() as folder: + fp = pathlib.Path(folder) / fn + o1.dump_tar(fp) + o4 = output.Output.load_tar(fp) + np.testing.assert_almost_equal( + o4["interpolation_xgrid"], d["interpolation_xgrid"] + ) + fn = "test" + with pytest.raises(ValueError, match="wrong suffix"): + o1.dump_tar(fn) + with pytest.raises(ValueError, match="wrong suffix"): + o1.load_tar(fn) def test_io_bin(self): d = self.fake_output()