Skip to content

Commit

Permalink
Remove support for cluster handling in shredder
Browse files Browse the repository at this point in the history
  • Loading branch information
relud committed Feb 11, 2020
1 parent dce350d commit 444c8eb
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 269 deletions.
10 changes: 10 additions & 0 deletions GRAVEYARD.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,13 @@ history every day, which had started to take on the order of 1 hour to run. The
v2 tables instead define a `day_0` view and a `day_13` view and rely on
the Growth and Usage Dashboard (GUD) to query them separately and join the
results together at query time.

## Shredder support for per-cluster deletes

- [Removal PR](https://github.com/mozilla/bigquery-etl/pull/733)

For `telemetry_stable.main_v4` the shredder used `SELECT` statements over single
clusters, then combined the results to remove rows from the table. This was an
attempt to improve performance so that reserved slots would be cheaper than
on-demand pricing, but it turned out to be slower than using `DELETE`
statements for whole partitions.
70 changes: 4 additions & 66 deletions bigquery_etl/shredder/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,6 @@

from dataclasses import dataclass
from functools import partial
from itertools import chain
from typing import Optional, Tuple

from ..util.sql_table_id import sql_table_id


SHARED_PROD = "moz-fx-data-shared-prod"
Expand All @@ -31,52 +27,18 @@ def dataset_id(self):
"""Dataset Id."""
return self.table.split(".", 1)[0]

@property
def sql_table_id(self):
"""Make sql_table_id available as a property for easier templating."""
return sql_table_id(self)


@dataclass(frozen=True)
class ClusterCondition:
"""Data class for cluster condition."""

condition: str
needs_clustering: bool


@dataclass(frozen=True)
class DeleteTarget:
"""Data class for deletion request target.
Without cluster conditions rows will be removed using either one DELETE
statement for the whole table, or one DELETE statement per partition if the
table is larger than some configurable threshold.
When provided cluster conditions are used to divide up deletes into parts
smaller than partitions. This is a mitigation specifically for main_v4
because it has thousands of sparsely populated columns and partitions in
excess of 10TiB, resulting in very slow DELETE performance that could
exceed 6 hours for a single partition and makes flat-rate pricing more
expensive than on-demand pricing.
To improve performance vs DELETE operations, cluster conditions can set
needs_clustering to False to avoid the overhead of clustering results when
the condition identifies a single cluster.
Each cluster condition is used with a SELECT statement to extract rows from
the target table into an intermediate table while filtering out rows with
deletion requests. The intermediate tables are then combined using a copy
operation to overwrite target table partitions.
This means that every row must be covered by precisely one cluster
condition. Any rows not covered by a cluster condition would be dropped,
and any rows covered by multiple conditions would be duplicated.
Rows will be removed using either one DELETE statement for the whole table,
or one DELETE statement per partition if the table is larger than some
configurable threshold.
"""

table: str
field: str
cluster_conditions: Optional[Tuple[ClusterCondition, ...]] = None
project: str = SHARED_PROD

@property
Expand All @@ -89,11 +51,6 @@ def dataset_id(self):
"""Dataset Id."""
return self.table.split(".", 1)[0]

@property
def sql_table_id(self):
"""Make sql_table_id available as a property for easier templating."""
return sql_table_id(self)


CLIENT_ID = "client_id"
GLEAN_CLIENT_ID = "client_info.client_id"
Expand Down Expand Up @@ -196,26 +153,7 @@ def sql_table_id(self):
client_id_target(table="telemetry_stable.frecency_update_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.health_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.heartbeat_v4"): DESKTOP_SRC,
client_id_target(
table="telemetry_stable.main_v4",
cluster_conditions=tuple(
ClusterCondition(condition, needs_clustering)
for condition, needs_clustering in chain(
{
f"sample_id = {sample_id} AND normalized_channel = 'release'": False
for sample_id in range(100)
}.items(),
[
(
"(sample_id IS NULL "
"OR normalized_channel IS NULL "
"OR normalized_channel != 'release')",
True,
)
],
)
),
): DESKTOP_SRC,
client_id_target(table="telemetry_stable.main_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.modules_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.new_profile_v4"): DESKTOP_SRC,
client_id_target(table="telemetry_stable.saved_session_v4"): DESKTOP_SRC,
Expand Down
Loading

0 comments on commit 444c8eb

Please sign in to comment.