Skip to content

Commit

Permalink
Caching 2021 queries (#2254)
Browse files Browse the repository at this point in the history
* Update README.md

* Top CDNs

* caching queries part 1

* LH queries

* caching sql part 2

* changes from review

* comment
  • Loading branch information
rviscomi authored Sep 24, 2021
1 parent 360fcae commit f2dec20
Show file tree
Hide file tree
Showing 25 changed files with 827 additions and 7 deletions.
12 changes: 5 additions & 7 deletions sql/2021/caching/README.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
# 2021 Caching queries

<!--
This directory contains all of the 2021 Caching chapter queries.
Resources:

Each query should have a corresponding `metric_name.sql` file.
Note that readers are linked to this directory, so try to make the SQL file names descriptive for easy browsing.
Analysts: if helpful, you can use this README to give additional info about the queries.
-->
- [Planning doc](https://docs.google.com/document/d/1BSdljOv86CPAP5T0A9jcV6bU7NJp1f2mTFa4ZZ7xxe8/edit)
- [Results sheet](https://docs.google.com/spreadsheets/d/1-v3yR0LZIC3t4zWtqTgR3jJsKjjRMP-HATU2caP8e2c/edit)
- [2019 chapter](https://almanac.httparchive.org/en/2019/caching)
- [2020 chapter](https://almanac.httparchive.org/en/2020/caching)
71 changes: 71 additions & 0 deletions sql/2021/caching/cache_control_directives.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#standardSQL
# Use of Cache-Control directives
SELECT
client,
COUNT(0) AS total_requests,
COUNTIF(uses_cache_control) AS total_using_cache_control,
COUNTIF(uses_max_age) AS total_using_max_age,
COUNTIF(uses_no_cache) AS total_using_no_cache,
COUNTIF(uses_public) AS total_using_public,
COUNTIF(uses_must_revalidate) AS total_using_must_revalidate,
COUNTIF(uses_no_store) AS total_using_no_store,
COUNTIF(uses_private) AS total_using_private,
COUNTIF(uses_proxy_revalidate) AS total_using_proxy_revalidate,
COUNTIF(uses_s_maxage) AS total_using_s_maxage,
COUNTIF(uses_no_transform) AS total_using_no_transform,
COUNTIF(uses_immutable) AS total_using_immutable,
COUNTIF(uses_stale_while_revalidate) AS total_using_stale_while_revalidate,
COUNTIF(uses_stale_if_error) AS total_using_stale_if_error,
COUNTIF(uses_no_store AND uses_no_cache AND uses_max_age_zero) AS total_using_no_store_and_no_cache_and_max_age_zero,
COUNTIF(uses_no_store AND uses_no_cache AND NOT uses_max_age_zero) AS total_using_no_store_and_no_cache_only,
COUNTIF(uses_no_store AND NOT uses_no_cache AND NOT uses_max_age_zero) AS total_using_no_store_only,
COUNTIF(uses_max_age_zero AND NOT uses_no_store) AS total_using_max_age_zero_without_no_store,
COUNTIF(uses_pre_check_zero AND uses_post_check_zero) AS total_using_pre_check_zero_and_post_check_zero,
COUNTIF(uses_pre_check_zero) AS total_using_pre_check_zero,
COUNTIF(uses_post_check_zero) AS total_using_post_check_zero,
COUNTIF(uses_cache_control AND NOT uses_max_age AND NOT uses_no_cache AND NOT uses_public AND NOT uses_must_revalidate AND NOT uses_no_store AND NOT uses_private AND NOT uses_proxy_revalidate AND NOT uses_s_maxage AND NOT uses_no_transform AND NOT uses_immutable AND NOT uses_stale_while_revalidate AND NOT uses_stale_if_error AND NOT uses_pre_check_zero AND NOT uses_post_check_zero) AS total_erroneous_directives,
COUNTIF(uses_cache_control) / COUNT(0) AS pct_using_cache_control,
COUNTIF(uses_max_age) / COUNT(0) AS pct_using_max_age,
COUNTIF(uses_no_cache) / COUNT(0) AS pct_using_no_cache,
COUNTIF(uses_public) / COUNT(0) AS pct_using_public,
COUNTIF(uses_must_revalidate) / COUNT(0) AS pct_using_must_revalidate,
COUNTIF(uses_no_store) / COUNT(0) AS pct_using_no_store,
COUNTIF(uses_private) / COUNT(0) AS pct_using_private,
COUNTIF(uses_proxy_revalidate) / COUNT(0) AS pct_using_proxy_revalidate,
COUNTIF(uses_s_maxage) / COUNT(0) AS pct_using_s_maxage,
COUNTIF(uses_no_transform) / COUNT(0) AS pct_using_no_transform,
COUNTIF(uses_immutable) / COUNT(0) AS pct_using_immutable,
COUNTIF(uses_stale_while_revalidate) / COUNT(0) AS pct_using_stale_while_revalidate,
COUNTIF(uses_stale_if_error) / COUNT(0) AS pct_using_stale_if_error,
COUNTIF(uses_no_store AND uses_no_cache AND uses_max_age_zero) / COUNT(0) AS pct_using_no_store_and_no_cache_and_max_age_zero,
COUNTIF(uses_no_store AND uses_no_cache AND NOT uses_max_age_zero) / COUNT(0) AS pct_using_no_store_and_no_cache_only,
COUNTIF(uses_no_store AND NOT uses_no_cache AND NOT uses_max_age_zero) / COUNT(0) AS pct_using_no_store_only,
COUNTIF(uses_max_age_zero AND NOT uses_no_store) / COUNT(0) AS pct_using_max_age_zero_without_no_store,
COUNTIF(uses_pre_check_zero AND uses_post_check_zero) / COUNT(0) AS pct_using_pre_check_zero_and_post_check_zero,
COUNTIF(uses_pre_check_zero) / COUNT(0) AS pct_using_pre_check_zero,
COUNTIF(uses_post_check_zero) / COUNT(0) AS pct_using_post_check_zero,
COUNTIF(uses_cache_control AND NOT uses_max_age AND NOT uses_no_cache AND NOT uses_public AND NOT uses_must_revalidate AND NOT uses_no_store AND NOT uses_private AND NOT uses_proxy_revalidate AND NOT uses_s_maxage AND NOT uses_no_transform AND NOT uses_immutable AND NOT uses_stale_while_revalidate AND NOT uses_stale_if_error AND NOT uses_pre_check_zero AND NOT uses_post_check_zero) / COUNT(0) AS pct_erroneous_directives
FROM (
SELECT
_TABLE_SUFFIX AS client,
TRIM(resp_cache_control) != "" AS uses_cache_control,
REGEXP_CONTAINS(resp_cache_control, r'(?i)max-age\s*=\s*[0-9]+') AS uses_max_age,
REGEXP_CONTAINS(resp_cache_control, r'(?i)max-age\s*=\s*0') AS uses_max_age_zero,
REGEXP_CONTAINS(resp_cache_control, r'(?i)public') AS uses_public,
REGEXP_CONTAINS(resp_cache_control, r'(?i)no-cache') AS uses_no_cache,
REGEXP_CONTAINS(resp_cache_control, r'(?i)must-revalidate') AS uses_must_revalidate,
REGEXP_CONTAINS(resp_cache_control, r'(?i)no-store') AS uses_no_store,
REGEXP_CONTAINS(resp_cache_control, r'(?i)private') AS uses_private,
REGEXP_CONTAINS(resp_cache_control, r'(?i)proxy-revalidate') AS uses_proxy_revalidate,
REGEXP_CONTAINS(resp_cache_control, r'(?i)s-maxage\s*=\s*[0-9]+') AS uses_s_maxage,
REGEXP_CONTAINS(resp_cache_control, r'(?i)no-transform') AS uses_no_transform,
REGEXP_CONTAINS(resp_cache_control, r'(?i)immutable') AS uses_immutable,
REGEXP_CONTAINS(resp_cache_control, r'(?i)stale-while-revalidate\s*=\s*[0-9]+') AS uses_stale_while_revalidate,
REGEXP_CONTAINS(resp_cache_control, r'(?i)stale-if-error\s*=\s*[0-9]+') AS uses_stale_if_error,
REGEXP_CONTAINS(resp_cache_control, r'(?i)pre-check\s*=\s*0') AS uses_pre_check_zero,
REGEXP_CONTAINS(resp_cache_control, r'(?i)post-check\s*=\s*0') AS uses_post_check_zero
FROM
`httparchive.summary_requests.2021_07_01_*`
)
GROUP BY
client
33 changes: 33 additions & 0 deletions sql/2021/caching/cache_control_directives_invalid.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#standardSQL
# List of invalid Cache-Control directive names.
SELECT
client,
total_directives,
total_using_cache_control,
directive_name,
directive_occurrences,
directive_occurrences / total_using_cache_control AS pct_of_cache_control,
directive_occurrences / total_directives AS pct_of_total_directives
FROM (
SELECT
_TABLE_SUFFIX AS client,
directive_name,
COUNT(0) AS directive_occurrences,
SUM(COUNT(0)) OVER (PARTITION BY _TABLE_SUFFIX) AS total_directives,
SUM(COUNTIF(TRIM(resp_cache_control) != '')) OVER (PARTITION BY _TABLE_SUFFIX) AS total_using_cache_control
FROM
`httparchive.summary_requests.2021_07_01_*`
LEFT JOIN
UNNEST(REGEXP_EXTRACT_ALL(LOWER(resp_cache_control), r'([a-z][^,\s="\']*)')) AS directive_name
GROUP BY
client,
directive_name)
WHERE
directive_name NOT IN (
'max-age', 'public', 'no-cache', 'must-revalidate', 'no-store',
'private', 'proxy-revalidate', 's-maxage', 'no-transform',
'immutable', 'stale-while-revalidate', 'stale-if-error',
'pre-check', 'post-check')
ORDER BY
client,
directive_occurrences DESC
32 changes: 32 additions & 0 deletions sql/2021/caching/cache_ttl_and_content_age_diff.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#standardSQL
# Difference between Cache TTL and the contents age
CREATE TEMPORARY FUNCTION toTimestamp(date_string STRING)
RETURNS INT64 LANGUAGE js AS '''
try {
var timestamp = Math.round(new Date(date_string).getTime() / 1000);
return isNaN(timestamp) || timestamp < 0 ? -1 : timestamp;
} catch (e) {
return null;
}
''';

SELECT
client,
percentile,
APPROX_QUANTILES(diff_in_days, 1000 IGNORE NULLS)[OFFSET(percentile * 10)] AS diff_in_days
FROM (
SELECT
_TABLE_SUFFIX AS client,
ROUND((expAge - (startedDateTime - toTimestamp(resp_last_modified))) / (60 * 60 * 24), 2) AS diff_in_days
FROM
`httparchive.summary_requests.2021_07_01_*`
WHERE
resp_last_modified <> "" AND
expAge > 0),
UNNEST([10, 25, 50, 75, 90]) AS percentile
GROUP BY
client,
percentile
ORDER BY
client,
percentile
16 changes: 16 additions & 0 deletions sql/2021/caching/cache_ttl_lighthouse_score.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#standardSQL
# Distribution of cache TTL Lighthouse scores
SELECT
_TABLE_SUFFIX AS client,
JSON_EXTRACT_SCALAR(report, "$.audits.uses-long-cache-ttl.score") AS caching_score,
COUNT(0) AS num_pages,
SUM(COUNT(0)) OVER (PARTITION BY _TABLE_SUFFIX) AS total,
COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY _TABLE_SUFFIX) AS pct_pages
FROM
`httparchive.lighthouse.2021_07_01_*`
GROUP BY
client,
caching_score
ORDER BY
client,
caching_score ASC
16 changes: 16 additions & 0 deletions sql/2021/caching/cache_wastedbytes_lighthouse.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#standardSQL
# Distribution of bytes wasted (absence of adequate caching) from Lighthouse
SELECT
_TABLE_SUFFIX AS client,
ROUND(CAST(JSON_EXTRACT_SCALAR(report, "$.audits.uses-long-cache-ttl.details.summary.wastedBytes") AS NUMERIC) / 1024 / 1024) AS mbyte_savings,
COUNT(0) AS num_pages,
SUM(COUNT(0)) OVER (PARTITION BY _TABLE_SUFFIX) AS total,
COUNT(0) / SUM(COUNT(0)) OVER (PARTITION BY _TABLE_SUFFIX) AS pct_pages
FROM
`httparchive.lighthouse.2021_07_01_*`
GROUP BY
client,
mbyte_savings
ORDER BY
client,
mbyte_savings ASC
30 changes: 30 additions & 0 deletions sql/2021/caching/content_age_older_than_ttl.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#standardSQL
# Requests with a content age older than its TTL
CREATE TEMPORARY FUNCTION toTimestamp(date_string STRING)
RETURNS INT64 LANGUAGE js AS '''
try {
var timestamp = Math.round(new Date(date_string).getTime() / 1000);
return isNaN(timestamp) || timestamp < 0 ? -1 : timestamp;
} catch (e) {
return null;
}
''';

SELECT
client,
COUNT(0) AS total_req,
COUNTIF(diff < 0) AS req_too_short_cache,
COUNTIF(diff < 0) / COUNT(0) AS perc_req_too_short_cache
FROM (
SELECT
_TABLE_SUFFIX AS client,
expAge - (startedDateTime - toTimestamp(resp_last_modified)) AS diff
FROM
`httparchive.summary_requests.2021_07_01_*`
WHERE
resp_last_modified <> "" AND
expAge > 0)
GROUP BY
client
ORDER BY
client
36 changes: 36 additions & 0 deletions sql/2021/caching/content_age_older_than_ttl_by_party.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#standardSQL
# Difference between Cache TTL and the content age for third party request
CREATE TEMPORARY FUNCTION toTimestamp(date_string STRING)
RETURNS INT64 LANGUAGE js AS '''
try {
var timestamp = Math.round(new Date(date_string).getTime() / 1000);
return isNaN(timestamp) || timestamp < 0 ? -1 : timestamp;
} catch (e) {
return null;
}
''';

SELECT
client,
party,
COUNT(0) AS total_req,
COUNTIF(diff < 0) AS req_too_short_cache,
COUNTIF(diff < 0) / COUNT(0) AS perc_req_too_short_cache
FROM (
SELECT
_TABLE_SUFFIX AS client,
IF(NET.HOST(url) IN (
SELECT domain FROM `httparchive.almanac.third_parties` WHERE date = '2021-07-01' AND category != 'hosting'
), 'third party', 'first party') AS party,
requests.expAge - (requests.startedDateTime - toTimestamp(requests.resp_last_modified)) AS diff
FROM
`httparchive.summary_requests.2021_07_01_*` requests
WHERE
TRIM(requests.resp_last_modified) <> "" AND
expAge > 0)
GROUP BY
client,
party
ORDER BY
client,
party
60 changes: 60 additions & 0 deletions sql/2021/caching/cookie_attributes.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#standardSQL
# Popularity of top Set-Cookie attributes/directives
CREATE TEMPORARY FUNCTION getCookieAttributes(headers STRING)
RETURNS ARRAY<STRING> DETERMINISTIC LANGUAGE js AS '''
try {
var $ = JSON.parse(headers);
return $.filter(header => {
return header.name.toLowerCase() == 'set-cookie';
}).flatMap(header => {
return Array.from(new Set(header.value.split(';').slice(1).map(attr => {
return attr.trim().split('=')[0].trim();
})));
});
} catch (e) {
return [];
}
''';

CREATE TEMPORARY FUNCTION countCookies(headers STRING)
RETURNS INT64 DETERMINISTIC LANGUAGE js AS '''
try {
var $ = JSON.parse(headers);
return $.filter(header => {
return header.name.toLowerCase() == 'set-cookie';
}).length;
} catch (e) {
return 0;
}
''';

SELECT
client,
attr.value AS attr,
attr.count AS freq,
total,
attr.count / total AS pct
FROM (
SELECT
client,
APPROX_TOP_COUNT(attr, 100) AS attrs
FROM
`httparchive.almanac.requests`,
UNNEST(getCookieAttributes(response_headers)) AS attr
WHERE
date = '2021-07-01'
GROUP BY
client)
JOIN (
SELECT
client,
SUM(countCookies(response_headers)) AS total
FROM
`httparchive.almanac.requests`
GROUP BY
client)
USING
(client),
UNNEST(attrs) AS attr
ORDER BY
pct DESC
37 changes: 37 additions & 0 deletions sql/2021/caching/cookie_names.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#standardSQL
# Popularity of top Set-Cookie names
CREATE TEMPORARY FUNCTION getCookies(headers STRING)
RETURNS ARRAY<STRING> DETERMINISTIC LANGUAGE js AS '''
try {
var $ = JSON.parse(headers);
return $.filter(header => {
return header.name.toLowerCase() == 'set-cookie';
}).map(header => {
return header.value.split('=')[0].trim();
});
} catch (e) {
return [];
}
''';

SELECT
client,
cookie.value AS cookie,
cookie.count AS freq,
total,
cookie.count / total AS pct
FROM (
SELECT
client,
APPROX_TOP_COUNT(cookie, 100) AS cookies,
COUNT(0) AS total
FROM
`httparchive.almanac.requests`,
UNNEST(getCookies(response_headers)) AS cookie
WHERE
date = '2021-07-01'
GROUP BY
client),
UNNEST(cookies) AS cookie
ORDER BY
pct DESC
Loading

0 comments on commit f2dec20

Please sign in to comment.