Skip to content

Commit

Permalink
Have Earth Engine calculate coordinate information.
Browse files Browse the repository at this point in the history
Fixes #57. Per the suggestion of @raspstephan, we avoid computing lat/lng information ourselves and instead pull our coordinate information via RPC calls into EE. There didn't seem to be a good way to do this within `get_info()`, so this introduces two additional IO calls. This should only add overhead to the `open_dataset()` call (i.e. once) and not to every data index operation.

Running benchmarks now, so far:
```
open_dataset():avg=33.11,std=8.23,best=22.81,worst=48.00
open_and_chunk():avg=37.61,std=8.79,best=23.95,worst=53.59
```

PiperOrigin-RevId: 572310584
  • Loading branch information
Xee authors committed Oct 10, 2023
1 parent 4a6f7d8 commit 74ec35b
Show file tree
Hide file tree
Showing 2 changed files with 120 additions and 107 deletions.
218 changes: 114 additions & 104 deletions xee/ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,13 @@
}


class _GetComputedPixels:
"""Wrapper around `ee.data.computePixels()` to make retries simple."""

def __getitem__(self, params) -> np.ndarray:
return ee.data.computePixels(params)


class EarthEngineStore(common.AbstractDataStore):
"""Read-only Data Store for Google Earth Engine."""

Expand Down Expand Up @@ -205,12 +212,12 @@ def __init__(
)
# We add and subtract the scale to solve an off-by-one error. With this
# adjustment, we achieve parity with a pure `computePixels()` call.
x_min, y_min = self.project(x_min_0 - self.scale_x, y_min_0)
x_min, y_min = self.transform(x_min_0 - self.scale_x, y_min_0)
if _bounds_are_invalid(x_min, y_min, self.scale_units == 'degree'):
x_min, y_min = self.project(x_min_0, y_min_0)
x_max, y_max = self.project(x_max_0, y_max_0 + self.scale_y)
x_min, y_min = self.transform(x_min_0, y_min_0)
x_max, y_max = self.transform(x_max_0, y_max_0 + self.scale_y)
if _bounds_are_invalid(x_max, y_max, self.scale_units == 'degree'):
x_max, y_max = self.project(x_max_0, y_max_0)
x_max, y_max = self.transform(x_max_0, y_max_0)
self.bounds = x_min, y_min, x_max, y_max

self.chunks = self.PREFERRED_CHUNKS.copy()
Expand Down Expand Up @@ -318,12 +325,96 @@ def _assign_preferred_chunks(self) -> Chunks:
chunks[y_dim_name] = self.chunks['height']
return chunks

def project(self, xs: float, ys: float) -> tuple[float, float]:
def transform(self, xs: float, ys: float) -> tuple[float, float]:
transformer = pyproj.Transformer.from_crs(
self.crs.geodetic_crs, self.crs, always_xy=True
)
return transformer.transform(xs, ys)

def project(self, bbox: types.BBox) -> types.Grid:
"""Translate a bounding box (pixel space) to a grid (projection space).
Here, we calculate a simple affine transformation to get a specific region
when computing pixels.
Args:
bbox: Bounding box in pixel space.
Returns:
A Grid, to be passed into `computePixels()`'s "grid" keyword. Defines the
appropriate region of data to return according to the Array's configured
projection and scale.
"""
# The origin of the image is in the top left corner. X is the minimum value
# and Y is the maximum value.
x_origin, _, _, y_origin = self.bounds # x_min, x_max, y_min, y_max
x_start, y_start, x_end, y_end = bbox
width = x_end - x_start
height = y_end - y_start

return {
# The size of the bounding box. The affine transform and project will be
# applied, so we can think in terms of pixels.
'dimensions': {
'width': width,
'height': height,
},
'affineTransform': {
# Since the origin is in the top left corner, we want to translate
# the start of the grid to the positive direction for X and the
# negative direction for Y.
'translateX': x_origin + self.scale_x * x_start,
'translateY': y_origin + self.scale_y * y_start,
# Define the scale for each pixel (e.g. the number of meters between
# each value).
'scaleX': self.scale_x,
'scaleY': self.scale_y,
},
'crsCode': self.crs_arg,
}

def image_to_array(
self,
image: ee.Image,
pixels_getter=_GetComputedPixels(),
dtype=np.float32,
**kwargs,
) -> np.ndarray:
"""Gets the pixels for a given image as a numpy array.
This method includes exponential backoff (with jitter) when trying to get
pixel data.
Args:
image: An EE image.
pixels_getter: An object whose `__getitem__()` method calls
`computePixels()`.
dtype: a np.dtype. The returned array will be in this dtype.
**kwargs: Additional settings for `params` in `computePixels(params)`. For
example, a `grid` dictionary.
Returns:
An numpy array containing the pixels computed based on the given image.
"""
image = image.unmask(self.mask_value)
params = {
'expression': image,
'fileFormat': 'NUMPY_NDARRAY',
**kwargs,
}
raw = common.robust_getitem(
pixels_getter, params, catch=ee.ee_exception.EEException
)

# TODO(#9): Find a way to make this more efficient. This is needed because
# `raw` is a structured array of all the same dtype (i.e. number of images).
arr = np.array(raw.tolist(), dtype=dtype)
data = arr.T

# Sets EE nodata masked value to NaNs.
data = np.where(data == self.mask_value, np.nan, data)
return data

@functools.lru_cache()
def _band_attrs(self, band_name: str) -> types.BandInfo:
try:
Expand Down Expand Up @@ -403,9 +494,17 @@ def get_variables(self) -> utils.Frozen[str, xarray.Variable]:
f'ImageCollection due to: {e}.'
)

x_min_0, y_min_0, x_max_0, y_max_0 = self.bounds
width_coord = np.linspace(x_min_0, x_max_0, v0.shape[1])
height_coord = np.linspace(y_max_0, y_min_0, v0.shape[2])
lnglat_img = ee.Image.pixelLonLat()
lon_grid = self.project((0, 0, v0.shape[1], 1))
lat_grid = self.project((0, 0, 1, v0.shape[2]))
lon = self.image_to_array(
lnglat_img, grid=lon_grid, dtype=np.float32, bandIds=['longitude']
)
lat = self.image_to_array(
lnglat_img, grid=lat_grid, dtype=np.float32, bandIds=['latitude']
)
width_coord = np.squeeze(lon)
height_coord = np.squeeze(lat)

x_dim_name, y_dim_name = self.dimension_names

Expand Down Expand Up @@ -478,25 +577,14 @@ def geometry_to_bounds(geom: ee.Geometry) -> types.Bounds:
return _ee_bounds_to_bounds(bounds)


class _GetComputedPixels:
"""Wrapper around `ee.data.computePixels()` to make retries simple."""

def __getitem__(self, params) -> np.ndarray:
return ee.data.computePixels(params)


class EarthEngineBackendArray(backends.BackendArray):
"""Array backend for Earth Engine."""

def __init__(self, variable_name: str, ee_store: EarthEngineStore):
self.variable_name = variable_name
self.store = ee_store

self.scale_x = ee_store.scale_x
self.scale_y = ee_store.scale_y
self.scale = ee_store.scale
self.crs_arg = ee_store.crs_arg
self.crs = ee_store.crs
self.bounds = ee_store.bounds

# It looks like different bands have different dimensions & transforms!
Expand All @@ -506,51 +594,14 @@ def __init__(self, variable_name: str, ee_store: EarthEngineStore):

x_min, y_min, x_max, y_max = self.bounds

x_size = int(np.ceil((x_max - x_min) / np.abs(self.scale_x)))
y_size = int(np.ceil((y_max - y_min) / np.abs(self.scale_y)))
x_size = int(np.ceil((x_max - x_min) / np.abs(self.store.scale_x)))
y_size = int(np.ceil((y_max - y_min) / np.abs(self.store.scale_y)))

self.shape = (ee_store.n_images, x_size, y_size)
self._apparent_chunks = {k: 1 for k in self.store.PREFERRED_CHUNKS.keys()}
if isinstance(self.store.chunks, dict):
self._apparent_chunks = self.store.chunks.copy()

def _to_array(
self, image: ee.Image, pixels_getter=_GetComputedPixels(), **kwargs
) -> np.ndarray:
"""Gets the pixels for a given image as a numpy array.
This method includes exponential backoff (with jitter) when trying to get
pixel data.
Args:
image: An EE image.
pixels_getter: An object whose `__getitem__()` method calls
`computePixels()`.
**kwargs: Additional settings for `params` in `computePixels(params)`. For
example, a `grid` dictionary.
Returns:
An numpy array containing the pixels computed based on the given image.
"""
image = image.unmask(self.store.mask_value)
params = {
'expression': image,
'fileFormat': 'NUMPY_NDARRAY',
**kwargs,
}
raw = common.robust_getitem(
pixels_getter, params, catch=ee.ee_exception.EEException
)

# TODO(#9): Find a way to make this more efficient. This is needed because
# `raw` is a structured array of all the same dtype (i.e. number of images).
arr = np.array(raw.tolist(), dtype=self.dtype)
data = arr.T

# Sets EE nodata masked value to NaNs.
data = np.where(data == self.store.mask_value, np.nan, data)
return data

def __getitem__(self, key: indexing.ExplicitIndexer) -> np.typing.ArrayLike:
return indexing.explicit_indexing_adapter(
key,
Expand All @@ -559,48 +610,6 @@ def __getitem__(self, key: indexing.ExplicitIndexer) -> np.typing.ArrayLike:
self._raw_indexing_method,
)

def _project(self, bbox: types.BBox) -> types.Grid:
"""Translate a bounding box (pixel space) to a grid (projection space).
Here, we calculate a simple affine transformation to get a specific region
when computing pixels.
Args:
bbox: Bounding box in pixel space.
Returns:
A Grid, to be passed into `computePixels()`'s "grid" keyword. Defines the
appropriate region of data to return according to the Array's configured
projection and scale.
"""
# The origin of the image is in the top left corner. X is the minimum value
# and Y is the maximum value.
x_origin, _, _, y_origin = self.bounds # x_min, x_max, y_min, y_max
x_start, y_start, x_end, y_end = bbox
width = x_end - x_start
height = y_end - y_start

return {
# The size of the bounding box. The affine transform and project will be
# applied, so we can think in terms of pixels.
'dimensions': {
'width': width,
'height': height,
},
'affineTransform': {
# Since the origin is in the top left corner, we want to translate
# the start of the grid to the positive direction for X and the
# negative direction for Y.
'translateX': x_origin + self.scale_x * x_start,
'translateY': y_origin + self.scale_y * y_start,
# Define the scale for each pixel (e.g. the number of meters between
# each value).
'scaleX': self.scale_x,
'scaleY': self.scale_y,
},
'crsCode': self.crs_arg,
}

def _key_to_slices(
self, key: tuple[Union[int, slice], ...]
) -> tuple[tuple[slice, ...], tuple[int, ...]]:
Expand Down Expand Up @@ -676,7 +685,9 @@ def _raw_indexing_method(
# User does not want to use any chunks...
if self.store.chunks == -1:
target_image = self._slice_collection(key[0])
out = self._to_array(target_image, grid=self._project(bbox))
out = self.store.image_to_array(
target_image, grid=self.store.project(bbox), dtype=self.dtype
)

if squeeze_axes:
out = np.squeeze(out, squeeze_axes)
Expand Down Expand Up @@ -728,9 +739,8 @@ def _make_tile(
"""Get a numpy array from EE for a specific 3D bounding box (a 'tile')."""
tile_idx, (istart, iend, *bbox) = tile_index
target_image = self._slice_collection(slice(istart, iend))
return tile_idx, self._to_array(
target_image, grid=self._project(tuple(bbox))
)
return tile_idx, self.store.image_to_array(
target_image, grid=self.store.project(tuple(bbox)), dtype=self.dtype)

def _tile_indexes(
self, index_range: slice, bbox: types.BBox
Expand Down
9 changes: 6 additions & 3 deletions xee/ext_integration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,10 +216,13 @@ def __getitem__(self, params):
return ee.data.computePixels(params)

arr = xee.EarthEngineBackendArray('B5', self.store)
grid = arr._project((0, 10, 0, 10))
grid = self.store.project((0, 10, 0, 10))
getter = ErroneousPixelsGetter()
arr._to_array(
self.store.image_collection.first(), pixels_getter=getter, grid=grid
self.store.image_to_array(
self.store.image_collection.first(),
pixels_getter=getter,
grid=grid,
dtype=arr.dtype,
)

self.assertEqual(getter.count, 3)
Expand Down

0 comments on commit 74ec35b

Please sign in to comment.