Skip to content

Commit 10ec52f

Browse files
authored
feat: add bigframes.bigquery.st_regionstats to join raster data from Earth Engine (#2228)
* feat: add bigframes.bigquery.st_regionstats to join raster data from Earth Engine * upgrade sqlglot * address samples lint error * avoid sqlglot rtrim/ltrim bug
1 parent 5663d2a commit 10ec52f

File tree

16 files changed

+399
-7
lines changed

16 files changed

+399
-7
lines changed

bigframes/bigquery/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
st_intersection,
4141
st_isclosed,
4242
st_length,
43+
st_regionstats,
4344
st_simplify,
4445
)
4546
from bigframes.bigquery._operations.json import (
@@ -81,6 +82,7 @@
8182
st_intersection,
8283
st_isclosed,
8384
st_length,
85+
st_regionstats,
8486
st_simplify,
8587
# json ops
8688
json_extract,

bigframes/bigquery/_operations/geo.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,13 @@
1414

1515
from __future__ import annotations
1616

17-
from typing import Union
17+
import json
18+
from typing import Mapping, Optional, Union
1819

1920
import shapely # type: ignore
2021

2122
from bigframes import operations as ops
23+
import bigframes.dataframe
2224
import bigframes.geopandas
2325
import bigframes.series
2426

@@ -677,6 +679,65 @@ def st_length(
677679
return series
678680

679681

682+
def st_regionstats(
683+
geography: Union[bigframes.series.Series, bigframes.geopandas.GeoSeries],
684+
raster_id: str,
685+
band: Optional[str] = None,
686+
include: Optional[str] = None,
687+
options: Optional[Mapping[str, Union[str, int, float]]] = None,
688+
) -> bigframes.series.Series:
689+
"""Returns statistics summarizing the pixel values of the raster image
690+
referenced by raster_id that intersect with geography.
691+
692+
The statistics include the count, minimum, maximum, sum, standard
693+
deviation, mean, and area of the valid pixels of the raster band named
694+
band_name. Google Earth Engine computes the results of the function call.
695+
696+
See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_regionstats
697+
698+
Args:
699+
geography (bigframes.series.Series | bigframes.geopandas.GeoSeries):
700+
A series of geography objects to intersect with the raster image.
701+
raster_id (str):
702+
A string that identifies a raster image. The following formats are
703+
supported. A URI from an image table provided by Google Earth Engine
704+
in BigQuery sharing (formerly Analytics Hub). A URI for a readable
705+
GeoTIFF raster file. A Google Earth Engine asset path that
706+
references public catalog data or project-owned assets with read
707+
access.
708+
band (Optional[str]):
709+
A string in one of the following formats:
710+
A single band within the raster image specified by raster_id. A
711+
formula to compute a value from the available bands in the raster
712+
image. The formula uses the Google Earth Engine image expression
713+
syntax. Bands can be referenced by their name, band_name, in
714+
expressions. If you don't specify a band, the first band of the
715+
image is used.
716+
include (Optional[str]):
717+
An optional string formula that uses the Google Earth Engine image
718+
expression syntax to compute a pixel weight. The formula should
719+
return values from 0 to 1. Values outside this range are set to the
720+
nearest limit, either 0 or 1. A value of 0 means that the pixel is
721+
invalid and it's excluded from analysis. A positive value means that
722+
a pixel is valid. Values between 0 and 1 represent proportional
723+
weights for calculations, such as weighted means.
724+
options (Mapping[str, Union[str, int, float]], optional):
725+
A dictionary of options to pass to the function. See the BigQuery
726+
documentation for a list of available options.
727+
728+
Returns:
729+
bigframes.pandas.Series:
730+
A STRUCT Series containing the computed statistics.
731+
"""
732+
op = ops.GeoStRegionStatsOp(
733+
raster_id=raster_id,
734+
band=band,
735+
include=include,
736+
options=json.dumps(options) if options else None,
737+
)
738+
return geography._apply_unary_op(op)
739+
740+
680741
def st_simplify(
681742
geography: "bigframes.series.Series",
682743
tolerance_meters: float,

bigframes/core/compile/ibis_compiler/operations/geo_ops.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@
1616

1717
from typing import cast
1818

19+
from bigframes_vendored import ibis
1920
from bigframes_vendored.ibis.expr import types as ibis_types
2021
import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes
22+
import bigframes_vendored.ibis.expr.operations.geospatial as ibis_geo
2123
import bigframes_vendored.ibis.expr.operations.udf as ibis_udf
2224

2325
from bigframes.core.compile.ibis_compiler import scalar_op_compiler
@@ -101,6 +103,35 @@ def geo_st_isclosed_op_impl(x: ibis_types.Value):
101103
return st_isclosed(x)
102104

103105

106+
@register_unary_op(ops.GeoStRegionStatsOp, pass_op=True)
107+
def geo_st_regionstats_op_impl(
108+
geography: ibis_types.Value,
109+
op: ops.GeoStRegionStatsOp,
110+
):
111+
if op.band:
112+
band = ibis.literal(op.band, type=ibis_dtypes.string())
113+
else:
114+
band = None
115+
116+
if op.include:
117+
include = ibis.literal(op.include, type=ibis_dtypes.string())
118+
else:
119+
include = None
120+
121+
if op.options:
122+
options = ibis.literal(op.options, type=ibis_dtypes.json())
123+
else:
124+
options = None
125+
126+
return ibis_geo.GeoRegionStats(
127+
arg=geography, # type: ignore
128+
raster_id=ibis.literal(op.raster_id, type=ibis_dtypes.string()), # type: ignore
129+
band=band, # type: ignore
130+
include=include, # type: ignore
131+
options=options, # type: ignore
132+
).to_expr()
133+
134+
104135
@register_unary_op(ops.GeoStSimplifyOp, pass_op=True)
105136
def st_simplify_op_impl(x: ibis_types.Value, op: ops.GeoStSimplifyOp):
106137
x = cast(ibis_types.GeoSpatialValue, x)

bigframes/core/compile/sqlglot/expressions/geo_ops.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,32 @@ def _(expr: TypedExpr, op: ops.GeoStLengthOp) -> sge.Expression:
7474
return sge.func("ST_LENGTH", expr.expr)
7575

7676

77+
@register_unary_op(ops.GeoStRegionStatsOp, pass_op=True)
78+
def _(
79+
geography: TypedExpr,
80+
op: ops.GeoStRegionStatsOp,
81+
):
82+
args = [geography.expr, sge.convert(op.raster_id)]
83+
if op.band:
84+
args.append(sge.Kwarg(this="band", expression=sge.convert(op.band)))
85+
if op.include:
86+
args.append(sge.Kwarg(this="include", expression=sge.convert(op.include)))
87+
if op.options:
88+
args.append(
89+
sge.Kwarg(this="options", expression=sge.JSON(this=sge.convert(op.options)))
90+
)
91+
return sge.func("ST_REGIONSTATS", *args)
92+
93+
94+
@register_unary_op(ops.GeoStSimplifyOp, pass_op=True)
95+
def _(expr: TypedExpr, op: ops.GeoStSimplifyOp) -> sge.Expression:
96+
return sge.func(
97+
"ST_SIMPLIFY",
98+
expr.expr,
99+
sge.convert(op.tolerance_meters),
100+
)
101+
102+
77103
@register_unary_op(ops.geo_x_op)
78104
def _(expr: TypedExpr) -> sge.Expression:
79105
return sge.func("SAFE.ST_X", expr.expr)

bigframes/operations/__init__.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@
121121
GeoStBufferOp,
122122
GeoStDistanceOp,
123123
GeoStLengthOp,
124+
GeoStRegionStatsOp,
124125
GeoStSimplifyOp,
125126
)
126127
from bigframes.operations.json_ops import (
@@ -415,12 +416,13 @@
415416
"geo_st_geogpoint_op",
416417
"geo_st_intersection_op",
417418
"geo_st_isclosed_op",
418-
"GeoStBufferOp",
419-
"GeoStLengthOp",
420-
"GeoStSimplifyOp",
421419
"geo_x_op",
422420
"geo_y_op",
421+
"GeoStBufferOp",
423422
"GeoStDistanceOp",
423+
"GeoStLengthOp",
424+
"GeoStRegionStatsOp",
425+
"GeoStSimplifyOp",
424426
# AI ops
425427
"AIClassify",
426428
"AIGenerate",

bigframes/operations/geo_ops.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import dataclasses
16+
from typing import Optional
1617

1718
from bigframes import dtypes
1819
from bigframes.operations import base_ops
@@ -135,6 +136,29 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
135136
return dtypes.FLOAT_DTYPE
136137

137138

139+
@dataclasses.dataclass(frozen=True)
140+
class GeoStRegionStatsOp(base_ops.UnaryOp):
141+
"""See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_regionstats"""
142+
143+
name = "geo_st_regionstats"
144+
raster_id: str
145+
band: Optional[str]
146+
include: Optional[str]
147+
options: Optional[str]
148+
149+
def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
150+
return dtypes.struct_type(
151+
[
152+
("min", dtypes.FLOAT_DTYPE),
153+
("max", dtypes.FLOAT_DTYPE),
154+
("sum", dtypes.FLOAT_DTYPE),
155+
("count", dtypes.INT_DTYPE),
156+
("mean", dtypes.FLOAT_DTYPE),
157+
("area", dtypes.FLOAT_DTYPE),
158+
]
159+
)
160+
161+
138162
@dataclasses.dataclass(frozen=True)
139163
class GeoStSimplifyOp(base_ops.UnaryOp):
140164
name = "st_simplify"
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Code sample for https://docs.cloud.google.com/bigquery/docs/raster-data#analytics-hub-source"""
16+
17+
18+
def test_st_regionstats() -> None:
19+
project_id = "bigframes-dev"
20+
21+
# [START bigquery_dataframes_st_regionstats]
22+
import datetime
23+
from typing import cast
24+
25+
import bigframes.bigquery as bbq
26+
import bigframes.pandas as bpd
27+
28+
# TODO: Set the project_id to your Google Cloud project ID.
29+
# project_id = "your-project-id"
30+
bpd.options.bigquery.project = project_id
31+
32+
# TODO: Set the dataset_id to the ID of the dataset that contains the
33+
# `climate` table. This is likely a linked dataset to Earth Engine.
34+
# See: https://cloud.google.com/bigquery/docs/link-earth-engine
35+
linked_dataset = "era5_land_daily_aggregated"
36+
37+
# For the best efficiency, use partial ordering mode.
38+
bpd.options.bigquery.ordering_mode = "partial"
39+
40+
# Load the table of country boundaries.
41+
countries = bpd.read_gbq("bigquery-public-data.overture_maps.division_area")
42+
43+
# Filter to just the countries.
44+
countries = countries[countries["subtype"] == "country"].copy()
45+
countries["name"] = countries["names"].struct.field("primary")
46+
countries["simplified_geometry"] = bbq.st_simplify(
47+
countries["geometry"],
48+
tolerance_meters=10_000,
49+
)
50+
51+
# Get the reference to the temperature data from a linked dataset.
52+
# Note: This sample assumes you have a linked dataset to Earth Engine.
53+
image_href = (
54+
bpd.read_gbq(f"{project_id}.{linked_dataset}.climate")
55+
.set_index("start_datetime")
56+
.loc[[datetime.datetime(2025, 1, 1, tzinfo=datetime.timezone.utc)], :]
57+
)
58+
raster_id = image_href["assets"].struct.field("image").struct.field("href")
59+
raster_id = raster_id.item()
60+
stats = bbq.st_regionstats(
61+
countries["simplified_geometry"],
62+
raster_id=cast(str, raster_id),
63+
band="temperature_2m",
64+
)
65+
66+
# Extract the mean and convert from Kelvin to Celsius.
67+
countries["mean_temperature"] = stats.struct.field("mean") - 273.15
68+
69+
# Sort by the mean temperature to find the warmest countries.
70+
result = countries[["name", "mean_temperature"]].sort_values(
71+
"mean_temperature", ascending=False
72+
)
73+
print(result.head(10))
74+
# [END bigquery_dataframes_st_regionstats]
75+
76+
assert len(result) > 0
77+
78+
79+
if __name__ == "__main__":
80+
test_st_regionstats()

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@
5454
"pydata-google-auth >=1.8.2",
5555
"requests >=2.27.1",
5656
"shapely >=1.8.5",
57-
"sqlglot >=23.6.3",
57+
# 25.20.0 introduces this fix https://github.com/TobikoData/sqlmesh/issues/3095 for rtrim/ltrim.
58+
"sqlglot >=25.20.0",
5859
"tabulate >=0.9",
5960
"ipywidgets >=7.7.1",
6061
"humanize >=4.6.0",

testing/constraints-3.9.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ pydata-google-auth==1.8.2
2121
requests==2.27.1
2222
scikit-learn==1.2.2
2323
shapely==1.8.5
24-
sqlglot==23.6.3
24+
sqlglot==25.20.0
2525
tabulate==0.9
2626
ipywidgets==7.7.1
2727
humanize==4.6.0
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
*
4+
FROM UNNEST(ARRAY<STRUCT<`bfcol_0` STRING, `bfcol_1` INT64>>[STRUCT('POINT(1 1)', 0)])
5+
), `bfcte_1` AS (
6+
SELECT
7+
*,
8+
ST_REGIONSTATS(
9+
`bfcol_0`,
10+
'ee://some/raster/uri',
11+
band => 'band1',
12+
include => 'some equation',
13+
options => JSON '{"scale": 100}'
14+
) AS `bfcol_2`
15+
FROM `bfcte_0`
16+
), `bfcte_2` AS (
17+
SELECT
18+
*,
19+
`bfcol_2`.`min` AS `bfcol_5`,
20+
`bfcol_2`.`max` AS `bfcol_6`,
21+
`bfcol_2`.`sum` AS `bfcol_7`,
22+
`bfcol_2`.`count` AS `bfcol_8`,
23+
`bfcol_2`.`mean` AS `bfcol_9`,
24+
`bfcol_2`.`area` AS `bfcol_10`
25+
FROM `bfcte_1`
26+
)
27+
SELECT
28+
`bfcol_5` AS `min`,
29+
`bfcol_6` AS `max`,
30+
`bfcol_7` AS `sum`,
31+
`bfcol_8` AS `count`,
32+
`bfcol_9` AS `mean`,
33+
`bfcol_10` AS `area`
34+
FROM `bfcte_2`
35+
ORDER BY
36+
`bfcol_1` ASC NULLS LAST

0 commit comments

Comments
 (0)