From 9ef2a4cbda3d238cb4ee044c31d79924fa436842 Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Wed, 4 Oct 2023 16:33:22 -0700 Subject: [PATCH 1/9] Add query to count content-types used for WordPress pages --- sql/2023/10/page-content-types.sql | 59 ++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 sql/2023/10/page-content-types.sql diff --git a/sql/2023/10/page-content-types.sql b/sql/2023/10/page-content-types.sql new file mode 100644 index 0000000..90c5033 --- /dev/null +++ b/sql/2023/10/page-content-types.sql @@ -0,0 +1,59 @@ +# HTTP Archive query to get counts of content-types used for WordPress pages. +# +# WPP Research, Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# See query results here: ... +WITH + + pages AS ( + SELECT + page + FROM + `httparchive.all.pages`, + UNNEST(technologies) AS t + WHERE + date = '2023-08-01' AND + is_root_page AND + t.technology = 'WordPress' + ), + + # h/t https://discuss.httparchive.org/t/help-finding-list-of-home-pages-with-specific-http-response-header/2567/2 + requests AS ( + SELECT + url, + REGEXP_REPLACE( resp_headers.value, ' *;.*$', '' ) AS content_type + FROM + `httparchive.all.requests`, + UNNEST(response_headers) as resp_headers + WHERE + date = "2023-08-01" AND + lower(resp_headers.name) = 'content-type' AND + is_main_document AND + root_page = url + ) + +SELECT + content_type, + COUNT(content_type) AS count +FROM + requests +JOIN + pages +ON + pages.page = requests.url +GROUP BY + content_type +ORDER BY + count DESC From 3530bda01a0c62543a0ec02c4c9d20d66ed2a46e Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Wed, 4 Oct 2023 16:57:03 -0700 Subject: [PATCH 2/9] Add PR reference and add to index --- sql/2023/10/page-content-types.sql | 2 +- sql/README.md | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sql/2023/10/page-content-types.sql b/sql/2023/10/page-content-types.sql index 90c5033..dd40943 100644 --- a/sql/2023/10/page-content-types.sql +++ b/sql/2023/10/page-content-types.sql @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# See query results here: ... +# See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/74 WITH pages AS ( diff --git a/sql/README.md b/sql/README.md index 5ba7cd3..8b2d326 100644 --- a/sql/README.md +++ b/sql/README.md @@ -18,6 +18,10 @@ Once you are ready to add a new query to the repository, open a pull request fol ## Query index +### 2023/10 + +* [Counts for Content-Types used for WordPress pages](./2023/10/page-content-types.sql) + ### 2023/08 * [Counts for WordPress theme/plugin script placements (whether blocking/async/defer in head/footer)](./2023/08/theme-plugin-script-placements.sql) From 2028ae60bf31d77c0c0ae10ceff8e2b5a0b7627d Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Thu, 5 Oct 2023 14:11:08 -0700 Subject: [PATCH 3/9] Improve joining Co-authored-by: Felix Arntz --- sql/2023/10/page-content-types.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/2023/10/page-content-types.sql b/sql/2023/10/page-content-types.sql index dd40943..3156276 100644 --- a/sql/2023/10/page-content-types.sql +++ b/sql/2023/10/page-content-types.sql @@ -19,7 +19,7 @@ WITH pages AS ( SELECT - page + page AS url FROM `httparchive.all.pages`, UNNEST(technologies) AS t @@ -49,10 +49,10 @@ SELECT COUNT(content_type) AS count FROM requests -JOIN +INNER JOIN pages -ON - pages.page = requests.url +USING + (url) GROUP BY content_type ORDER BY From 09b4d7f6a9bc9aaff97349fa0b5d3c5816c8bec5 Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Thu, 5 Oct 2023 14:20:44 -0700 Subject: [PATCH 4/9] Add is_root_page constraint to pages query --- sql/2023/10/page-content-types.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/2023/10/page-content-types.sql b/sql/2023/10/page-content-types.sql index 3156276..43ce54e 100644 --- a/sql/2023/10/page-content-types.sql +++ b/sql/2023/10/page-content-types.sql @@ -39,6 +39,7 @@ WITH UNNEST(response_headers) as resp_headers WHERE date = "2023-08-01" AND + is_root_page AND lower(resp_headers.name) = 'content-type' AND is_main_document AND root_page = url From 00b24e2b18a3e377efa510499754de2021e5914a Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Thu, 5 Oct 2023 14:24:07 -0700 Subject: [PATCH 5/9] Restrict pages to mobile clients --- sql/2023/10/page-content-types.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/2023/10/page-content-types.sql b/sql/2023/10/page-content-types.sql index 43ce54e..fa95cc0 100644 --- a/sql/2023/10/page-content-types.sql +++ b/sql/2023/10/page-content-types.sql @@ -26,6 +26,7 @@ WITH WHERE date = '2023-08-01' AND is_root_page AND + client = 'mobile' AND t.technology = 'WordPress' ), @@ -40,6 +41,7 @@ WITH WHERE date = "2023-08-01" AND is_root_page AND + client = 'mobile' AND lower(resp_headers.name) = 'content-type' AND is_main_document AND root_page = url From b6c2ff749d8fd43c47a92a5ba8f8cf75fff6cbb4 Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Thu, 26 Oct 2023 10:50:39 -0700 Subject: [PATCH 6/9] Apply formatting to WITH clause Co-authored-by: Felix Arntz --- sql/2023/10/page-content-types.sql | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sql/2023/10/page-content-types.sql b/sql/2023/10/page-content-types.sql index fa95cc0..8567bde 100644 --- a/sql/2023/10/page-content-types.sql +++ b/sql/2023/10/page-content-types.sql @@ -15,9 +15,7 @@ # limitations under the License. # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/74 -WITH - - pages AS ( +WITH pages AS ( SELECT page AS url FROM From b8999e10888cd5416028d6f4021dc01787137b87 Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Thu, 26 Oct 2023 10:52:30 -0700 Subject: [PATCH 7/9] Remove redundant root_page check Co-authored-by: Felix Arntz --- sql/2023/10/page-content-types.sql | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/2023/10/page-content-types.sql b/sql/2023/10/page-content-types.sql index 8567bde..1b83756 100644 --- a/sql/2023/10/page-content-types.sql +++ b/sql/2023/10/page-content-types.sql @@ -41,8 +41,7 @@ WITH pages AS ( is_root_page AND client = 'mobile' AND lower(resp_headers.name) = 'content-type' AND - is_main_document AND - root_page = url + is_main_document ) SELECT From 27c8751d52d1e7fd7144d89937d3980f022a4b8b Mon Sep 17 00:00:00 2001 From: Weston Ruter Date: Thu, 26 Oct 2023 10:54:29 -0700 Subject: [PATCH 8/9] Include both desktop and mobile clients Co-authored-by: Felix Arntz --- sql/2023/10/page-content-types.sql | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sql/2023/10/page-content-types.sql b/sql/2023/10/page-content-types.sql index 1b83756..f2bcde4 100644 --- a/sql/2023/10/page-content-types.sql +++ b/sql/2023/10/page-content-types.sql @@ -17,6 +17,7 @@ # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/74 WITH pages AS ( SELECT + client, page AS url FROM `httparchive.all.pages`, @@ -24,13 +25,13 @@ WITH pages AS ( WHERE date = '2023-08-01' AND is_root_page AND - client = 'mobile' AND t.technology = 'WordPress' ), # h/t https://discuss.httparchive.org/t/help-finding-list-of-home-pages-with-specific-http-response-header/2567/2 requests AS ( SELECT + client, url, REGEXP_REPLACE( resp_headers.value, ' *;.*$', '' ) AS content_type FROM @@ -39,21 +40,23 @@ WITH pages AS ( WHERE date = "2023-08-01" AND is_root_page AND - client = 'mobile' AND lower(resp_headers.name) = 'content-type' AND is_main_document ) SELECT + client, content_type, - COUNT(content_type) AS count + COUNT(url) AS count FROM requests INNER JOIN pages USING - (url) + (client, url) GROUP BY + client, content_type ORDER BY + client, count DESC From b8241fd4a7ed108074654afa286f60100f5cbd07 Mon Sep 17 00:00:00 2001 From: Felix Arntz Date: Thu, 26 Oct 2023 14:42:38 -0500 Subject: [PATCH 9/9] Remove extra indentation. --- sql/2023/10/page-content-types.sql | 52 +++++++++++++++--------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/sql/2023/10/page-content-types.sql b/sql/2023/10/page-content-types.sql index f2bcde4..abbce71 100644 --- a/sql/2023/10/page-content-types.sql +++ b/sql/2023/10/page-content-types.sql @@ -16,33 +16,33 @@ # See query results here: https://github.com/GoogleChromeLabs/wpp-research/pull/74 WITH pages AS ( - SELECT - client, - page AS url - FROM - `httparchive.all.pages`, - UNNEST(technologies) AS t - WHERE - date = '2023-08-01' AND - is_root_page AND - t.technology = 'WordPress' - ), + SELECT + client, + page AS url + FROM + `httparchive.all.pages`, + UNNEST(technologies) AS t + WHERE + date = '2023-08-01' AND + is_root_page AND + t.technology = 'WordPress' +), - # h/t https://discuss.httparchive.org/t/help-finding-list-of-home-pages-with-specific-http-response-header/2567/2 - requests AS ( - SELECT - client, - url, - REGEXP_REPLACE( resp_headers.value, ' *;.*$', '' ) AS content_type - FROM - `httparchive.all.requests`, - UNNEST(response_headers) as resp_headers - WHERE - date = "2023-08-01" AND - is_root_page AND - lower(resp_headers.name) = 'content-type' AND - is_main_document - ) +# h/t https://discuss.httparchive.org/t/help-finding-list-of-home-pages-with-specific-http-response-header/2567/2 +requests AS ( + SELECT + client, + url, + REGEXP_REPLACE( resp_headers.value, ' *;.*$', '' ) AS content_type + FROM + `httparchive.all.requests`, + UNNEST(response_headers) as resp_headers + WHERE + date = "2023-08-01" AND + is_root_page AND + lower(resp_headers.name) = 'content-type' AND + is_main_document +) SELECT client,