Skip to content

Commit

Permalink
Remove support for cluster handling in shredder
Browse files Browse the repository at this point in the history
  • Loading branch information
relud committed Feb 11, 2020
1 parent dce350d commit 444c8eb
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 269 deletions.
10 changes: 10 additions & 0 deletions GRAVEYARD.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,13 @@ history every day, which had started to take on the order of 1 hour to run. The
v2 tables instead define a `day_0` view and a `day_13` view and rely on
the Growth and Usage Dashboard (GUD) to query them separately and join the
results together at query time.

## Shredder support for per-cluster deletes

- [Removal PR](https://github.com/mozilla/bigquery-etl/pull/733)

For `telemetry_stable.main_v4` the shredder used `SELECT` statements over single
clusters, then combined the results to remove rows from the table. This was an
attempt to improve performance so that reserved slots would be cheaper than
on-demand pricing, but it turned out to be slower than using `DELETE`
statements for whole partitions.
70 changes: 4 additions & 66 deletions bigquery_etl/shredder/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@

from dataclasses import dataclass
from functools import partial
from itertools import chain
from typing import Optional, Tuple

from ..util.sql_table_id import sql_table_id


SHARED_PROD = "moz-fx-data-shared-prod"
Expand All @@ -31,52 +27,18 @@ def dataset_id(self):
"""Dataset Id."""
return self.table.split(".", 1)[0]

@property
def sql_table_id(self):
"""Make sql_table_id available as a property for easier templating."""
return sql_table_id(self)


@dataclass(frozen=True)
class ClusterCondition:
"""Data class for cluster condition."""

condition: str
needs_clustering: bool


@dataclass(frozen=True)
class DeleteTarget:
"""Data class for deletion request target.
Without cluster conditions rows will be removed using either one DELETE
statement for the whole table, or one DELETE statement per partition if the
table is larger than some configurable threshold.
When provided cluster conditions are used to divide up deletes into parts
smaller than partitions. This is a mitigation specifically for main_v4
because it has thousands of sparsely populated columns and partitions in
excess of 10TiB, resulting in very slow DELETE performance that could
exceed 6 hours for a single partition and makes flat-rate pricing more
expensive than on-demand pricing.
To improve performance vs DELETE operations, cluster conditions can set
needs_clustering to False to avoid the overhead of clustering results when
the condition identifies a single cluster.
Each cluster condition is used with a SELECT statement to extract rows from
the target table into an intermediate table while filtering out rows with
deletion requests. The intermediate tables are then combined using a copy
operation to overwrite target table partitions.
This means that every row must be covered by precisely one cluster
condition. Any rows not covered by a cluster condition would be dropped,
and any rows covered by multiple conditions would be duplicated.
Rows will be removed using either one DELETE statement for the whole table,
or one DELETE statement per partition if the table is larger than some
configurable threshold.
"""

table: str
field: str
cluster_conditions: Optional[Tuple[ClusterCondition, ...]] = None
project: str = SHARED_PROD

@property
Expand All @@ -89,11 +51,6 @@ def dataset_id(self):
"""Dataset Id."""
return self.table.split(".", 1)[0]

@property
def sql_table_id(self):
"""Make sql_table_id available as a property for easier templating."""
return sql_table_id(self)


CLIENT_ID = "client_id"
GLEAN_CLIENT_ID = "client_info.client_id"
Expand Down Expand Up @@ -196,26 +153,7 @@ def sql_table_id(self):
client_id_target(table="telemetry_stable.frecency_update_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.health_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.heartbeat_v4"): DESKTOP_SRC,
client_id_target(
table="telemetry_stable.main_v4",
cluster_conditions=tuple(
ClusterCondition(condition, needs_clustering)
for condition, needs_clustering in chain(
{
f"sample_id = {sample_id} AND normalized_channel = 'release'": False
for sample_id in range(100)
}.items(),
[
(
"(sample_id IS NULL "
"OR normalized_channel IS NULL "
"OR normalized_channel != 'release')",
True,
)
],
)
),
): DESKTOP_SRC,
client_id_target(table="telemetry_stable.main_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.modules_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.new_profile_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.saved_session_v4"): DESKTOP_SRC,
Expand Down
Loading

0 comments on commit 444c8eb

Please sign in to comment.