
Commit

Loading Benchmarks (#4477)
* Synthetic FF PP NetCDF and loading benchmarks.
trexfeathers authored Feb 14, 2022
1 parent d1d1e00 commit 15bd351
Showing 6 changed files with 543 additions and 51 deletions.
14 changes: 6 additions & 8 deletions .github/workflows/benchmark.yml
@@ -16,7 +16,9 @@ jobs:
      IRIS_TEST_DATA_PATH: benchmarks/iris-test-data
      IRIS_TEST_DATA_VERSION: "2.5"
      # Lets us manually bump the cache to rebuild
      ENV_CACHE_BUILD: "0"
      TEST_DATA_CACHE_BUILD: "2"
      PY_VER: 3.8

    steps:
      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
@@ -32,19 +34,15 @@ jobs:
        run: |
          pip install nox
      - name: Cache .nox and .asv/env directories
      - name: Cache environment directories
        id: cache-env-dir
        uses: actions/cache@v2
        with:
          path: |
            .nox
            benchmarks/.asv/env
          # Make sure GHA never gets an exact cache match by using the unique
          # github.sha. This means it will always store this run as a new
          # cache (Nox may have made relevant changes during run). Cache
          # restoration still succeeds via the partial restore-key match.
          key: ${{ runner.os }}-${{ github.sha }}
          restore-keys: ${{ runner.os }}
            $CONDA/pkgs
          key: ${{ runner.os }}-${{ hashFiles('requirements/') }}-${{ env.ENV_CACHE_BUILD }}

      - name: Cache test data directory
        id: cache-test-data
@@ -62,7 +60,7 @@ jobs:
          unzip -q iris-test-data.zip
          mkdir --parents ${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_LOC_PATH}
          mv iris-test-data-${IRIS_TEST_DATA_VERSION} ${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_PATH}
      - name: Set test data var
        run: |
          echo "OVERRIDE_TEST_DATA_REPOSITORY=${GITHUB_WORKSPACE}/${IRIS_TEST_DATA_PATH}/test_data" >> $GITHUB_ENV
41 changes: 0 additions & 41 deletions benchmarks/benchmarks/__init__.py
@@ -5,45 +5,4 @@
# licensing details.
"""Common code for benchmarks."""

import os
from pathlib import Path

# Environment variable names
_ASVDIR_VARNAME = "ASV_DIR" # As set in nightly script "asv_nightly/asv.sh"
_DATADIR_VARNAME = "BENCHMARK_DATA" # For local runs

ARTIFICIAL_DIM_SIZE = int(10e3) # For all artificial cubes, coords etc.

# Work out where the benchmark data dir is.
asv_dir = os.environ.get("ASV_DIR", None)
if asv_dir:
    # For an overnight run, this comes from the 'ASV_DIR' setting.
    benchmark_data_dir = Path(asv_dir) / "data"
else:
    # For a local run, you set 'BENCHMARK_DATA'.
    benchmark_data_dir = os.environ.get(_DATADIR_VARNAME, None)
    if benchmark_data_dir is not None:
        benchmark_data_dir = Path(benchmark_data_dir)


def testdata_path(*path_names):
    """
    Return the path of a benchmark test data file.
    These are based from a test-data location dir, which is either
    ${}/data (for overnight tests), or ${} for local testing.
    If neither of these were set, an error is raised.
    """.format(
        _ASVDIR_VARNAME, _DATADIR_VARNAME
    )
    if benchmark_data_dir is None:
        msg = (
            "Benchmark data dir is not defined : "
            'Either "${}" or "${}" must be set.'
        )
        raise (ValueError(msg.format(_ASVDIR_VARNAME, _DATADIR_VARNAME)))
    path = benchmark_data_dir.joinpath(*path_names)
    path = str(path)  # Because Iris doesn't understand Path objects yet.
    return path
94 changes: 94 additions & 0 deletions benchmarks/benchmarks/generate_data/__init__.py
@@ -0,0 +1,94 @@
# Copyright Iris contributors
#
# This file is part of Iris and is released under the LGPL license.
# See COPYING and COPYING.LESSER in the root of the repository for full
# licensing details.
"""
Scripts for generating supporting data for benchmarking.
Data generated using Iris should use :func:`run_function_elsewhere`, which
means that data is generated using a fixed version of Iris and a fixed
environment, rather than those that get changed when the benchmarking run
checks out a new commit.
Downstream use of data generated 'elsewhere' requires saving; usually in a
NetCDF file. Could also use pickling but there is a potential risk if the
benchmark sequence runs over two different Python versions.
"""
from inspect import getsource
from os import environ
from pathlib import Path
from subprocess import CalledProcessError, check_output, run
from textwrap import dedent

#: Python executable used by :func:`run_function_elsewhere`, set via env
#: variable of same name. Must be path of Python within an environment that
#: includes Iris (including dependencies and test modules) and Mule.
try:
    DATA_GEN_PYTHON = environ["DATA_GEN_PYTHON"]
    _ = check_output([DATA_GEN_PYTHON, "-c", "a = True"])
except KeyError:
    error = "Env variable DATA_GEN_PYTHON not defined."
    raise KeyError(error)
except (CalledProcessError, FileNotFoundError, PermissionError):
    error = (
        "Env variable DATA_GEN_PYTHON not a runnable python executable path."
    )
    raise ValueError(error)

# The default location of data files used in benchmarks. Used by CI.
default_data_dir = (Path(__file__).parents[2] / ".data").resolve()
# Optionally override the default data location with environment variable.
BENCHMARK_DATA = Path(environ.get("BENCHMARK_DATA", default_data_dir))
if BENCHMARK_DATA == default_data_dir:
    BENCHMARK_DATA.mkdir(exist_ok=True)
elif not BENCHMARK_DATA.is_dir():
    message = f"Not a directory: {BENCHMARK_DATA} ."
    raise ValueError(message)

# Manual flag to allow the rebuilding of synthetic data.
# False forces a benchmark run to re-make all the data files.
REUSE_DATA = True


def run_function_elsewhere(func_to_run, *args, **kwargs):
    """
    Run a given function using the :const:`DATA_GEN_PYTHON` executable.

    This structure allows the function to be written natively.

    Parameters
    ----------
    func_to_run : FunctionType
        The function object to be run.
        NOTE: the function must be completely self-contained, i.e. perform all
        its own imports (within the target :const:`DATA_GEN_PYTHON`
        environment).
    *args : tuple, optional
        Function call arguments. Must all be expressible as simple literals,
        i.e. the ``repr`` must be a valid literal expression.
    **kwargs: dict, optional
        Function call keyword arguments. All values must be expressible as
        simple literals (see ``*args``).

    Returns
    -------
    str
        The ``stdout`` from the run.

    """
    func_string = dedent(getsource(func_to_run))
    func_string = func_string.replace("@staticmethod\n", "")
    func_call_term_strings = [repr(arg) for arg in args]
    func_call_term_strings += [
        f"{name}={repr(val)}" for name, val in kwargs.items()
    ]
    func_call_string = (
        f"{func_to_run.__name__}(" + ",".join(func_call_term_strings) + ")"
    )
    python_string = "\n".join([func_string, func_call_string])
    result = run(
        [DATA_GEN_PYTHON, "-c", python_string], capture_output=True, check=True
    )
    return result.stdout
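
The sketch below (not part of the commit) illustrates how run_function_elsewhere is intended to be used. The helper name _make_cube and the save path are hypothetical; the point is that the helper performs all of its own imports, because only its source text plus a literal call string are passed to the DATA_GEN_PYTHON interpreter, so every argument must survive a repr() round trip.

# Illustrative sketch only; _make_cube and the save path are hypothetical.
def _make_cube(n: int, save_path: str) -> None:
    # Self-contained: imports happen inside, in the DATA_GEN_PYTHON environment.
    import numpy as np

    from iris import save
    from iris.cube import Cube

    cube = Cube(np.zeros((n, n), dtype=np.float32), long_name="synthetic")
    save(cube, save_path)


# Arguments are embedded via repr(), so simple literals only.
run_function_elsewhere(_make_cube, 1000, save_path="/tmp/synthetic.nc")
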
215 changes: 215 additions & 0 deletions benchmarks/benchmarks/generate_data/um_files.py
@@ -0,0 +1,215 @@
# Copyright Iris contributors
#
# This file is part of Iris and is released under the LGPL license.
# See COPYING and COPYING.LESSER in the root of the repository for full
# licensing details.
"""
Generate FF, PP and NetCDF files based on a minimal synthetic FF file.
NOTE: uses the Mule package, so depends on an environment with Mule installed.
"""


def _create_um_files(
    len_x: int, len_y: int, len_z: int, len_t: int, compress, save_paths: dict
) -> None:
    """
    Generate an FF object of given shape and compression, save to FF/PP/NetCDF.

    This is run externally
    (:func:`benchmarks.generate_data.run_function_elsewhere`), so all imports
    are self-contained and input parameters are simple types.
    """
    from copy import deepcopy
    from datetime import datetime
    from tempfile import NamedTemporaryFile

    from mo_pack import compress_wgdos as mo_pack_compress
    from mule import ArrayDataProvider, Field3, FieldsFile
    from mule.pp import fields_to_pp_file
    import numpy as np

    from iris import load_cube
    from iris import save as save_cube

    def packing_patch(*compress_args, **compress_kwargs) -> bytes:
        """
        Force conversion from returned :class:`memoryview` to :class:`bytes`.

        Downstream uses of :func:`mo_pack.compress_wgdos` were written
        for the ``Python2`` behaviour, where the returned buffer had a
        different ``__len__`` value to the current :class:`memoryview`.
        Unable to fix directly in Mule, so monkey patching for now.
        """
        return mo_pack_compress(*compress_args, **compress_kwargs).tobytes()

    import mo_pack

    mo_pack.compress_wgdos = packing_patch

    ########

    template = {
        "fixed_length_header": {"dataset_type": 3, "grid_staggering": 3},
        "integer_constants": {
            "num_p_levels": len_z,
            "num_cols": len_x,
            "num_rows": len_y,
        },
        "real_constants": {},
        "level_dependent_constants": {"dims": (len_z + 1, None)},
    }
    new_ff = FieldsFile.from_template(deepcopy(template))

    data_array = np.arange(len_x * len_y).reshape(len_x, len_y)
    array_provider = ArrayDataProvider(data_array)

    def add_field(level_: int, time_step_: int) -> None:
        """
        Add a minimal field to the new :class:`~mule.FieldsFile`.

        Includes the minimum information to allow Mule saving and Iris
        loading, as well as incrementation for vertical levels and time
        steps to allow generation of z and t dimensions.
        """
        new_field = Field3.empty()
        # To correspond to the header-release 3 class used.
        new_field.lbrel = 3
        # Mule uses the first element of the lookup to test for
        # unpopulated fields (and skips them), so the first element should
        # be set to something. The year will do.
        new_field.raw[1] = datetime.now().year

        # Horizontal.
        new_field.lbcode = 1
        new_field.lbnpt = len_x
        new_field.lbrow = len_y
        new_field.bdx = new_ff.real_constants.col_spacing
        new_field.bdy = new_ff.real_constants.row_spacing
        new_field.bzx = new_ff.real_constants.start_lon - 0.5 * new_field.bdx
        new_field.bzy = new_ff.real_constants.start_lat - 0.5 * new_field.bdy

        # Hemisphere.
        new_field.lbhem = 32
        # Processing.
        new_field.lbproc = 0

        # Vertical.
        # Hybrid height values by simulating sequences similar to those in a
        # theta file.
        new_field.lbvc = 65
        if level_ == 0:
            new_field.lblev = 9999
        else:
            new_field.lblev = level_

        level_1 = level_ + 1
        six_rec = 20 / 3
        three_rec = six_rec / 2

        new_field.blev = level_1 ** 2 * six_rec - six_rec
        new_field.brsvd1 = (
            level_1 ** 2 * six_rec + (six_rec * level_1) - three_rec
        )

        brsvd2_simulated = np.linspace(0.995, 0, len_z)
        shift = min(len_z, 2)
        bhrlev_simulated = np.concatenate(
            [np.ones(shift), brsvd2_simulated[:-shift]]
        )
        new_field.brsvd2 = brsvd2_simulated[level_]
        new_field.bhrlev = bhrlev_simulated[level_]

        # Time.
        new_field.lbtim = 11

        new_field.lbyr = time_step_
        for attr_name in ["lbmon", "lbdat", "lbhr", "lbmin", "lbsec"]:
            setattr(new_field, attr_name, 0)

        new_field.lbyrd = time_step_ + 1
        for attr_name in ["lbmond", "lbdatd", "lbhrd", "lbmind", "lbsecd"]:
            setattr(new_field, attr_name, 0)

        # Data and packing.
        new_field.lbuser1 = 1
        new_field.lbpack = int(compress)
        new_field.bacc = 0
        new_field.bmdi = -1
        new_field.lbext = 0
        new_field.set_data_provider(array_provider)

        new_ff.fields.append(new_field)

    for time_step in range(len_t):
        for level in range(len_z):
            add_field(level, time_step + 1)

    ff_path = save_paths.get("FF", None)
    pp_path = save_paths.get("PP", None)
    nc_path = save_paths.get("NetCDF", None)

    if ff_path:
        new_ff.to_file(ff_path)
    if pp_path:
        fields_to_pp_file(str(pp_path), new_ff.fields)
    if nc_path:
        temp_ff_path = None
        # Need an Iris Cube from the FF content.
        if ff_path:
            # Use the existing file.
            ff_cube = load_cube(ff_path)
        else:
            # Make a temporary file.
            temp_ff_path = NamedTemporaryFile()
            new_ff.to_file(temp_ff_path.name)
            ff_cube = load_cube(temp_ff_path.name)

        save_cube(ff_cube, nc_path, zlib=compress)
        if temp_ff_path:
            temp_ff_path.close()


FILE_EXTENSIONS = {"FF": "", "PP": ".pp", "NetCDF": ".nc"}


def create_um_files(
    len_x: int,
    len_y: int,
    len_z: int,
    len_t: int,
    compress: bool,
    file_types: list,
) -> dict:
    """
    Generate FF-based FF / PP / NetCDF files with specified shape and compression.

    All files representing a given shape are saved in a dedicated directory. A
    dictionary of the saved paths is returned.

    If the required files exist, they are re-used, unless
    :const:`benchmarks.REUSE_DATA` is ``False``.
    """
    # Self contained imports to avoid linting confusion with _create_um_files().
    from . import BENCHMARK_DATA, REUSE_DATA, run_function_elsewhere

    save_name_sections = ["UM", len_x, len_y, len_z, len_t]
    save_name = "_".join(str(section) for section in save_name_sections)
    save_dir = BENCHMARK_DATA / save_name
    if not save_dir.is_dir():
        save_dir.mkdir(parents=True)

    save_paths = {}
    files_exist = True
    for file_type in file_types:
        file_ext = FILE_EXTENSIONS[file_type]
        save_path = (save_dir / f"{compress}").with_suffix(file_ext)
        files_exist = files_exist and save_path.is_file()
        save_paths[file_type] = str(save_path)

    if not REUSE_DATA or not files_exist:
        _ = run_function_elsewhere(
            _create_um_files, len_x, len_y, len_z, len_t, compress, save_paths
        )

    return save_paths
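
The loading benchmarks added elsewhere in this commit drive create_um_files from ASV benchmark classes. As a hedged sketch of that pattern only (the class name, shape arguments and import path are illustrative, not taken from this diff), such a benchmark might look like:

# Illustrative sketch only; not one of the benchmark classes in this commit.
from iris import load

from .generate_data.um_files import create_um_files


class HypotheticalFileLoading:
    params = [["FF", "PP", "NetCDF"]]
    param_names = ["file_format"]

    def setup(self, file_format):
        # Files are created once by the DATA_GEN_PYTHON environment, then
        # re-used on subsequent runs while REUSE_DATA is True.
        save_paths = create_um_files(
            50, 50, 10, 10, compress=False, file_types=[file_format]
        )
        self.file_path = save_paths[file_format]

    def time_load(self, file_format):
        _ = load(self.file_path)
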