diff --git a/benchmarks/queries/clickbench/README.md b/benchmarks/queries/clickbench/README.md index 6797797409c1..2032427e1ef2 100644 --- a/benchmarks/queries/clickbench/README.md +++ b/benchmarks/queries/clickbench/README.md @@ -93,12 +93,14 @@ LIMIT 10; Results look like +``` +-------------+---------------------+---+------+------+------+ | ClientIP | WatchID | c | tmin | tmed | tmax | +-------------+---------------------+---+------+------+------+ | 1611957945 | 6655575552203051303 | 2 | 0 | 0 | 0 | | -1402644643 | 8566928176839891583 | 2 | 0 | 0 | 0 | +-------------+---------------------+---+------+------+------+ +``` ### Q5: Response start time distribution analysis (p95) @@ -120,13 +122,42 @@ LIMIT 10; ``` Results look like - +``` +-------------+---------------------+---+------+------+------+ | ClientIP | WatchID | c | tmin | tp95 | tmax | +-------------+---------------------+---+------+------+------+ | 1611957945 | 6655575552203051303 | 2 | 0 | 0 | 0 | | -1402644643 | 8566928176839891583 | 2 | 0 | 0 | 0 | +-------------+---------------------+---+------+------+------+ +``` + +### Q6: How many social shares meet complex multi-stage filtering criteria? +**Question**: What is the count of sharing actions from iPhone mobile users on specific social networks, within common timezones, participating in seasonal campaigns, with high screen resolutions and closely matched UTM parameters? +**Important Query Properties**: Simple filter with high-selectivity, Costly string matching, A large number of filters with high overhead are positioned relatively later in the process + +```sql +SELECT COUNT(*) AS ShareCount +FROM hits +WHERE + -- Stage 1: High-selectivity filters (fast) + "IsMobile" = 1 -- Filter mobile users + AND "MobilePhoneModel" LIKE 'iPhone%' -- Match iPhone models + AND "SocialAction" = 'share' -- Identify social sharing actions + + -- Stage 2: Moderate filters (cheap) + AND "SocialSourceNetworkID" IN (5, 12) -- Filter specific social networks + AND "ClientTimeZone" BETWEEN -5 AND 5 -- Restrict to common timezones + + -- Stage 3: Heavy computations (expensive) + AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL -- Find campaign-specific referrers + AND CASE + WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' + THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT + ELSE 0 + END > 1920 -- Extract and validate resolution parameter + AND levenshtein("UTMSource", "UTMCampaign") < 3 -- Verify UTM parameter similarity +``` +Result is empty,Since it has already been filtered by `"SocialAction" = 'share'`. ## Data Notes diff --git a/benchmarks/queries/clickbench/extended.sql b/benchmarks/queries/clickbench/extended.sql index fbabaf2a7021..ef3a409c9c02 100644 --- a/benchmarks/queries/clickbench/extended.sql +++ b/benchmarks/queries/clickbench/extended.sql @@ -3,4 +3,5 @@ SELECT COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserCountry"), COUNT(DISTI SELECT "BrowserCountry", COUNT(DISTINCT "SocialNetwork"), COUNT(DISTINCT "HitColor"), COUNT(DISTINCT "BrowserLanguage"), COUNT(DISTINCT "SocialAction") FROM hits GROUP BY 1 ORDER BY 2 DESC LIMIT 10; SELECT "SocialSourceNetworkID", "RegionID", COUNT(*), AVG("Age"), AVG("ParamPrice"), STDDEV("ParamPrice") as s, VAR("ParamPrice") FROM hits GROUP BY "SocialSourceNetworkID", "RegionID" HAVING s IS NOT NULL ORDER BY s DESC LIMIT 10; SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, MEDIAN("ResponseStartTiming") tmed, MAX("ResponseStartTiming") tmax FROM hits WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tmed DESC LIMIT 10; -SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT("ResponseStartTiming", 0.95) tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10; \ No newline at end of file +SELECT "ClientIP", "WatchID", COUNT(*) c, MIN("ResponseStartTiming") tmin, APPROX_PERCENTILE_CONT("ResponseStartTiming", 0.95) tp95, MAX("ResponseStartTiming") tmax FROM 'hits' WHERE "JavaEnable" = 0 GROUP BY "ClientIP", "WatchID" HAVING c > 1 ORDER BY tp95 DESC LIMIT 10; +SELECT COUNT(*) AS ShareCount FROM hits WHERE "IsMobile" = 1 AND "MobilePhoneModel" LIKE 'iPhone%' AND "SocialAction" = 'share' AND "SocialSourceNetworkID" IN (5, 12) AND "ClientTimeZone" BETWEEN -5 AND 5 AND regexp_match("Referer", '\/campaign\/(spring|summer)_promo') IS NOT NULL AND CASE WHEN split_part(split_part("URL", 'resolution=', 2), '&', 1) ~ '^\d+$' THEN split_part(split_part("URL", 'resolution=', 2), '&', 1)::INT ELSE 0 END > 1920 AND levenshtein("UTMSource", "UTMCampaign") < 3; \ No newline at end of file