Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use NetCDF variable's chunks on load #3131

Merged
merged 3 commits into from
Sep 13, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* NetCDF data variable chunk sizes are utilised at load time for significant performance improvements.
26 changes: 4 additions & 22 deletions lib/iris/fileformats/_pyke_rules/fc_rules_cf.krb
Original file line number Diff line number Diff line change
Expand Up @@ -1070,7 +1070,7 @@ fc_extras
import iris.coord_systems
import iris.fileformats.cf as cf
import iris.fileformats.netcdf
from iris.fileformats.netcdf import parse_cell_methods, UnknownCellMethodWarning
from iris.fileformats.netcdf import _get_cf_var_data, parse_cell_methods, UnknownCellMethodWarning
import iris.fileformats.pp as pp
import iris.exceptions
import iris.std_names
Expand Down Expand Up @@ -1712,25 +1712,16 @@ fc_extras
# Get units
attr_units = get_attr_units(cf_coord_var, attributes)

def cf_var_as_array(cf_var):
dtype = iris.fileformats.netcdf._get_actual_dtype(cf_var)
fill_value = getattr(cf_var.cf_data, '_FillValue',
netCDF4.default_fillvals[dtype.str[1:]])
proxy = iris.fileformats.netcdf.NetCDFDataProxy(
cf_var.shape, dtype, engine.filename,
cf_var.cf_name, fill_value)
return as_lazy_data(proxy)

# Get any coordinate point data.
if isinstance(cf_coord_var, cf.CFLabelVariable):
points_data = cf_coord_var.cf_label_data(cf_var)
else:
points_data = cf_var_as_array(cf_coord_var)
points_data = _get_cf_var_data(cf_coord_var, engine.filename)

# Get any coordinate bounds.
cf_bounds_var = get_cf_bounds_var(cf_coord_var)
if cf_bounds_var is not None:
bounds_data = cf_var_as_array(cf_bounds_var)
bounds_data = _get_cf_var_data(cf_bounds_var, engine.filename)

# Handle transposed bounds where the vertex dimension is not
# the last one. Test based on shape to support different
Expand Down Expand Up @@ -1783,16 +1774,7 @@ fc_extras
# Get units
attr_units = get_attr_units(cf_cm_attr, attributes)

def cf_var_as_array(cf_var):
dtype = cf_var.dtype
fill_value = getattr(cf_var.cf_data, '_FillValue',
netCDF4.default_fillvals[dtype.str[1:]])
proxy = iris.fileformats.netcdf.NetCDFDataProxy(
cf_var.shape, dtype, engine.filename,
cf_var.cf_name, fill_value)
return as_lazy_data(proxy)

data = cf_var_as_array(cf_cm_attr)
data = _get_cf_var_data(cf_cm_attr, engine.filename)

# Determine the name of the dimension/s shared between the CF-netCDF data variable
# and the coordinate being built.
Expand Down
15 changes: 12 additions & 3 deletions lib/iris/fileformats/netcdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,16 +501,25 @@ def _get_actual_dtype(cf_var):
return dummy_data.dtype


def _load_cube(engine, cf, cf_var, filename):
"""Create the cube associated with the CF-netCDF data variable."""
def _get_cf_var_data(cf_var, filename):
    """
    Return the lazy, chunked data of a CF-netCDF variable.

    The variable is wrapped in a :class:`NetCDFDataProxy`, which is then
    handed to ``as_lazy_data`` together with the chunking reported for the
    variable.

    Args:

    * cf_var:
        The CF variable to wrap. Its ``shape``, ``dtype``, ``cf_name`` and
        ``cf_data`` attributes are used.
    * filename:
        Name of the file holding the variable; passed on to the proxy.

    Returns:
        The result of ``as_lazy_data`` for the proxy.

    """
    dtype = _get_actual_dtype(cf_var)

    # Use the variable's own `_FillValue` when it has one, otherwise the
    # netCDF4 default fill value looked up from the variable's dtype.
    fill_value = getattr(cf_var.cf_data, '_FillValue',
                         netCDF4.default_fillvals[cf_var.dtype.str[1:]])
    proxy = NetCDFDataProxy(cf_var.shape, dtype, filename, cf_var.cf_name,
                            fill_value)

    chunks = cf_var.cf_data.chunking()
    # Chunks can be an iterable, None, or `'contiguous'`.
    # `'contiguous'` is treated the same as "no chunking information".
    if chunks == 'contiguous':
        chunks = None

    # Build the lazy array exactly once, now that the chunks are known.
    return as_lazy_data(proxy, chunks=chunks)


def _load_cube(engine, cf, cf_var, filename):
"""Create the cube associated with the CF-netCDF data variable."""
data = _get_cf_var_data(cf_var, filename)
cube = iris.cube.Cube(data)

# Reset the pyke inference engine.
Expand Down
83 changes: 83 additions & 0 deletions lib/iris/tests/unit/fileformats/netcdf/test__get_cf_var_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# (C) British Crown Copyright 2018, Met Office
#
# This file is part of Iris.
#
# Iris is free software: you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the
# Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Iris is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with Iris. If not, see <http://www.gnu.org/licenses/>.
"""Unit tests for the `iris.fileformats.netcdf._get_cf_var_data` function."""

from __future__ import (absolute_import, division, print_function)
from six.moves import (filter, input, map, range, zip) # noqa

# Import iris.tests first so that some things can be initialised before
# importing anything else.
import iris.tests as tests

from dask.array import Array as dask_array
import numpy as np

from iris._lazy_data import _limited_shape
import iris.fileformats.cf
from iris.fileformats.netcdf import _get_cf_var_data
from iris.tests import mock


class Test__get_cf_var_data(tests.IrisTest):
    """Check the type and chunking of what `_get_cf_var_data` returns."""

    def setUp(self):
        self.filename = 'DUMMY'
        self.shape = (3, 240, 200)
        # Chunking expected when the variable supplies none of its own.
        self.expected_chunks = _limited_shape(self.shape)

    def _make(self, chunksizes):
        """Build a mock CF variable whose `chunking()` gives `chunksizes`."""
        cf_data = mock.Mock(
            _FillValue=None,
            chunking=mock.MagicMock(return_value=chunksizes))
        return mock.MagicMock(spec=iris.fileformats.cf.CFVariable,
                              dtype=np.dtype('i4'),
                              cf_data=cf_data,
                              cf_name='DUMMY_VAR',
                              shape=self.shape)

    def _load(self, chunksizes):
        """Run `_get_cf_var_data` on a mock variable with `chunksizes`."""
        return _get_cf_var_data(self._make(chunksizes), self.filename)

    @staticmethod
    def _leading_chunks(lazy_data):
        """Return the size of the first chunk along each dimension."""
        return [dim_chunks[0] for dim_chunks in lazy_data.chunks]

    def test_cf_data_type(self):
        result = self._load([1, 12, 100])
        self.assertIsInstance(result, dask_array)

    def test_cf_data_chunks(self):
        chunks = [1, 12, 100]
        result = self._leading_chunks(self._load(chunks))
        self.assertArrayEqual(chunks, result)

    def test_cf_data_no_chunks(self):
        # No chunks means chunks are calculated from the array's shape by
        # `iris._lazy_data._limited_shape()`.
        result = self._leading_chunks(self._load(None))
        self.assertArrayEqual(result, self.expected_chunks)

    def test_cf_data_contiguous(self):
        # Chunks 'contiguous' is equivalent to no chunks.
        result = self._leading_chunks(self._load('contiguous'))
        self.assertArrayEqual(result, self.expected_chunks)


# Allow this test module to be run directly as a script.
if __name__ == "__main__":
    tests.main()
14 changes: 10 additions & 4 deletions lib/iris/tests/unit/fileformats/netcdf/test__load_cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,22 @@ def setUp(self):

def _make(self, names, attrs):
    """
    Build the mock CF reader and mock data variable used by the tests.

    Args:

    * names:
        Names used both as coordinate long names and as CF group keys.
    * attrs:
        One set of attributes per name; each is what the matching group
        entry's `cf_attrs_unused()` returns.

    Returns:
        A `(cf, cf_var)` pair of mocks.

    """
    shape = (1,)
    coords = [DimCoord(i, long_name=name) for i, name in enumerate(names)]

    cf_group = {}
    for name, cf_attrs in zip(names, attrs):
        cf_attrs_unused = mock.Mock(return_value=cf_attrs)
        cf_group[name] = mock.Mock(cf_attrs_unused=cf_attrs_unused)
    cf = mock.Mock(cf_group=cf_group)

    # `chunking()` must return a real tuple rather than a Mock, so report
    # one chunk covering the whole (single-element) variable.
    cf_data = mock.Mock(_FillValue=None)
    cf_data.chunking = mock.MagicMock(return_value=shape)
    # Pass `cf_data` and `shape` exactly once each, using the locals above.
    cf_var = mock.MagicMock(spec=iris.fileformats.cf.CFVariable,
                            dtype=np.dtype('i4'),
                            cf_data=cf_data,
                            cf_name='DUMMY_VAR',
                            cf_group=coords,
                            shape=shape)
    return cf, cf_var

def test_flag_pass_thru(self):
Expand Down Expand Up @@ -129,14 +132,17 @@ def setUp(self):
self.valid_max = mock.sentinel.valid_max

def _make(self, attrs):
    """
    Build the mock CF data variable used by the tests.

    Args:

    * attrs:
        The value returned by the mock variable's `cf_attrs_unused()`.

    Returns:
        The mock CF variable.

    """
    shape = (1,)
    cf_attrs_unused = mock.Mock(return_value=attrs)

    # `chunking()` must return a real tuple rather than a Mock, so report
    # one chunk covering the whole (single-element) variable.
    cf_data = mock.Mock(_FillValue=None)
    cf_data.chunking = mock.MagicMock(return_value=shape)
    # Pass `cf_data` and `shape` exactly once each, using the locals above.
    cf_var = mock.MagicMock(spec=iris.fileformats.cf.CFVariable,
                            dtype=np.dtype('i4'),
                            cf_data=cf_data,
                            cf_name='DUMMY_VAR',
                            cf_group=mock.Mock(),
                            cf_attrs_unused=cf_attrs_unused,
                            shape=shape)
    return cf_var

def test_flag_pass_thru(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,13 @@ class TestBoundsVertexDim(tests.IrisTest):
def setUp(self):
# Create coordinate cf variables and pyke engine.
points = np.arange(6).reshape(2, 3)

cf_data = self._make_cf_data(points)
self.cf_coord_var = mock.Mock(
spec=CFVariable,
dimensions=('foo', 'bar'),
cf_name='wibble',
cf_data=mock.Mock(),
cf_data=cf_data,
standard_name=None,
long_name='wibble',
units='m',
Expand All @@ -54,7 +56,8 @@ def setUp(self):

self.engine = mock.Mock(
cube=mock.Mock(),
cf_var=mock.Mock(dimensions=('foo', 'bar')),
cf_var=mock.Mock(dimensions=('foo', 'bar'),
cf_data=cf_data),
filename='DUMMY',
provides=dict(coordinates=[]))

Expand All @@ -72,14 +75,21 @@ def patched__getitem__(proxy_self, keys):
'iris.fileformats.netcdf.NetCDFDataProxy.__getitem__',
new=patched__getitem__)

@staticmethod
def _make_cf_data(vals):
    """Return a mock `cf_data` whose `chunking()` gives `vals.shape`."""
    chunking = mock.MagicMock(return_value=vals.shape)
    return mock.Mock(_FillValue=None, chunking=chunking)

def test_slowest_varying_vertex_dim(self):
# Create the bounds cf variable.
bounds = np.arange(24).reshape(4, 2, 3)
cf_data = self._make_cf_data(bounds)
self.cf_bounds_var = mock.Mock(
spec=CFVariable,
dimensions=('nv', 'foo', 'bar'),
cf_name='wibble_bnds',
cf_data=mock.Mock(),
cf_data=cf_data,
shape=bounds.shape,
dtype=bounds.dtype,
__getitem__=lambda self, key: bounds[key])
Expand Down Expand Up @@ -116,11 +126,12 @@ def test_slowest_varying_vertex_dim(self):

def test_fastest_varying_vertex_dim(self):
bounds = np.arange(24).reshape(2, 3, 4)
cf_data = self._make_cf_data(bounds)
self.cf_bounds_var = mock.Mock(
spec=CFVariable,
dimensions=('foo', 'bar', 'nv'),
cf_name='wibble_bnds',
cf_data=mock.Mock(),
cf_data=cf_data,
shape=bounds.shape,
dtype=bounds.dtype,
__getitem__=lambda self, key: bounds[key])
Expand Down Expand Up @@ -155,11 +166,12 @@ def test_fastest_with_different_dim_names(self):
# which are 'foo' and 'bar' (as permitted by the cf spec),
# this should still work because the vertex dim is the fastest varying.
bounds = np.arange(24).reshape(2, 3, 4)
cf_data = self._make_cf_data(bounds)
self.cf_bounds_var = mock.Mock(
spec=CFVariable,
dimensions=('x', 'y', 'nv'),
cf_name='wibble_bnds',
cf_data=mock.Mock(),
cf_data=cf_data,
shape=bounds.shape,
dtype=bounds.dtype,
__getitem__=lambda self, key: bounds[key])
Expand Down Expand Up @@ -194,11 +206,14 @@ class TestDtype(tests.IrisTest):
def setUp(self):
# Create coordinate cf variables and pyke engine.
points = np.arange(6).reshape(2, 3)
cf_data = mock.Mock(_FillValue=None)
cf_data.chunking = mock.MagicMock(return_value=points.shape)

self.cf_coord_var = mock.Mock(
spec=CFVariable,
dimensions=('foo', 'bar'),
cf_name='wibble',
cf_data=mock.Mock(),
cf_data=cf_data,
standard_name=None,
long_name='wibble',
units='m',
Expand Down