From 3d10611765a23107da2dfb520f0448c3b0d6f207 Mon Sep 17 00:00:00 2001 From: Matthias Date: Sun, 30 Jul 2023 00:40:13 +0200 Subject: [PATCH] Don't check preconnect links Preconnect links are used to establish a server connection without loading a specific resource yet. These links do not always point to a URL that should return a 200, and they are not user-facing, i.e. they don't show up in the final rendered version of a page. Therefore, I think we should not check them at all; not even in `--include-verbatim` mode, as they might not point to a valid resource. Fixes #897 --- lychee-lib/src/extract/html/html5ever.rs | 20 ++++++++++++++++++++ lychee-lib/src/extract/html/html5gum.rs | 23 ++++++++++++++++++++++- lychee-lib/src/extract/mod.rs | 3 ++- 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/lychee-lib/src/extract/html/html5ever.rs b/lychee-lib/src/extract/html/html5ever.rs index 25ed226aa2..ba29673dbb 100644 --- a/lychee-lib/src/extract/html/html5ever.rs +++ b/lychee-lib/src/extract/html/html5ever.rs @@ -69,6 +69,14 @@ impl TokenSink for LinkExtractor { } } + // Check and exclude rel=preconnect. Unlike prefetch and preload, + // preconnect only does DNS lookups and might not be a link to a resource + if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel") { + if rel.value.contains("preconnect") { + return TokenSinkResult::Continue; + } + } + for attr in attrs { let urls = LinkExtractor::extract_urls_from_elem_attr( &attr.name.local, @@ -136,6 +144,8 @@ impl LinkExtractor { // and https://html.spec.whatwg.org/multipage/indices.html#attributes-1 match (elem_name, attr_name) { + // TODO: Skip + // Common element/attribute combinations for links (_, "href" | "src" | "cite" | "usemap") // Less common (but still valid!) 
combinations @@ -353,4 +363,14 @@ mod tests { let uris = extract_html(input, false); assert_eq!(uris, expected); } + + #[test] + fn test_skip_preconnect() { + let input = r#" + <link rel="preconnect" href="https://example.com"> + "#; + + let uris = extract_html(input, false); + assert!(uris.is_empty()); + } } diff --git a/lychee-lib/src/extract/html/html5gum.rs b/lychee-lib/src/extract/html/html5gum.rs index de55ab4665..96c7b73bef 100644 --- a/lychee-lib/src/extract/html/html5gum.rs +++ b/lychee-lib/src/extract/html/html5gum.rs @@ -4,6 +4,7 @@ use super::{is_email_link, is_verbatim_elem, srcset}; use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri}; #[derive(Clone)] +#[allow(clippy::struct_excessive_bools)] struct LinkExtractor { // note: what html5gum calls a tag, lychee calls an element links: Vec<RawUri>, @@ -11,6 +12,7 @@ struct LinkExtractor { current_element_name: Vec<u8>, current_element_is_closing: bool, current_element_nofollow: bool, + current_element_preconnect: bool, current_attribute_name: Vec<u8>, current_attribute_value: Vec<u8>, last_start_element: Vec<u8>, @@ -33,6 +35,7 @@ impl LinkExtractor { current_element_name: Vec::new(), current_element_is_closing: false, current_element_nofollow: false, + current_element_preconnect: false, current_attribute_name: Vec::new(), current_attribute_value: Vec::new(), last_start_element: Vec::new(), @@ -147,7 +150,15 @@ impl LinkExtractor { if attr == "rel" && value.contains("nofollow") { self.current_element_nofollow = true; } - if self.current_element_nofollow { + + // Ignore links with rel=preconnect + // Unlike prefetch and preload, preconnect only makes + // a DNS lookup, so we don't want to extract those links. 
+ if attr == "rel" && value.contains("preconnect") { + self.current_element_preconnect = true; + } + + if self.current_element_nofollow || self.current_element_preconnect { self.current_attribute_name.clear(); self.current_attribute_value.clear(); return; @@ -507,4 +518,14 @@ mod tests { let uris = extract_html(input, false); assert_eq!(uris, expected); } + + #[test] + fn test_skip_preconnect() { + let input = r#" + <link rel="preconnect" href="https://example.com"> + "#; + + let uris = extract_html(input, false); + assert!(uris.is_empty()); + } } diff --git a/lychee-lib/src/extract/mod.rs b/lychee-lib/src/extract/mod.rs index 4b30ef4112..78dc88c2d9 100644 --- a/lychee-lib/src/extract/mod.rs +++ b/lychee-lib/src/extract/mod.rs @@ -245,7 +245,8 @@ mod tests { let expected_links = IntoIterator::into_iter([ website("https://example.com/"), website("https://example.com/favicon.ico"), - website("https://fonts.externalsite.com"), + // Note that we exclude `preconnect` links: + // website("https://fonts.externalsite.com"), website("https://example.com/docs/"), website("https://example.com/forum"), ])