Skip to content

Commit

Permalink
Don't check preconnect links (#1187)
Browse files Browse the repository at this point in the history
Preconnect links are used to establish a server connection without loading a
specific resource yet. These links do not always point to a URL that returns
a 200, and they are not user-facing, i.e. they don't show up in the
final rendered version of a page.

Therefore, we should not check them at all; not even in `--include-verbatim`
mode, as they might not point to a valid resource.

This turned out to require a significant overhaul of the html5gum extractor
to handle random attribute ordering correctly. Changes to the html5gum extractor:

* Refactor HTML link extractor for improved performance and maintainability
- Replace Vec<u8> with String for better readability and manipulation
- Introduce Element struct to encapsulate element-related data
- Use `HashMap<String, String>` for current_attributes for efficient lookups
- Add verbatim_stack to properly handle nested verbatim elements
- Remove unsafe code where possible, using String::from_utf8_lossy
- Improve attribute handling with `HashMap` entry API and prioritize `srcset`
- Simplify logic and consolidate verbatim element handling
- Enhance encapsulation in `LinkExtractor` struct
- Improve overall performance with more efficient data structures
- Increase flexibility for future feature additions or modifications

Fixes #897
  • Loading branch information
mre authored Oct 7, 2024
1 parent 8026684 commit 11adc09
Show file tree
Hide file tree
Showing 7 changed files with 306 additions and 207 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion lychee-lib/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ wiremock = "0.6.2"
serde_json = "1.0.128"
rstest = "0.23.0"
toml = "0.8.19"
pretty_assertions = "1.4.0"

[features]

Expand All @@ -91,4 +92,4 @@ vendored-openssl = ["openssl-sys/vendored"]
# See https://users.rust-lang.org/t/36630
check_example_domains = []

default = ["native-tls", "email-check"]
default = ["native-tls", "email-check"]
39 changes: 36 additions & 3 deletions lychee-lib/src/extract/html/html5ever.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ use html5ever::{
tokenizer::{Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts},
};

use super::{super::plaintext::extract_plaintext, is_email_link, is_verbatim_elem, srcset};
use super::{
super::plaintext::extract_raw_uri_from_plaintext, is_email_link, is_verbatim_elem, srcset,
};
use crate::types::uri::raw::RawUri;

#[derive(Clone, Default)]
Expand All @@ -26,7 +28,9 @@ impl TokenSink for LinkExtractor {
if self.current_verbatim_element_name.borrow().is_some() {
return TokenSinkResult::Continue;
}
self.links.borrow_mut().extend(extract_plaintext(&raw));
self.links
.borrow_mut()
.extend(extract_raw_uri_from_plaintext(&raw));
}
Token::TagToken(tag) => {
let Tag {
Expand Down Expand Up @@ -72,6 +76,14 @@ impl TokenSink for LinkExtractor {
}
}

// Check and exclude rel=preconnect. Unlike prefetch and preload, preconnect
// only opens a connection to the origin and might not be a link to a resource
if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel") {
if rel.value.contains("preconnect") {
return TokenSinkResult::Continue;
}
}

for attr in attrs {
let urls = LinkExtractor::extract_urls_from_elem_attr(
&attr.name.local,
Expand All @@ -80,7 +92,7 @@ impl TokenSink for LinkExtractor {
);

let new_urls = match urls {
None => extract_plaintext(&attr.value),
None => extract_raw_uri_from_plaintext(&attr.value),
Some(urls) => urls
.into_iter()
.filter(|url| {
Expand Down Expand Up @@ -140,6 +152,7 @@ impl LinkExtractor {
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1

match (elem_name, attr_name) {

// Common element/attribute combinations for links
(_, "href" | "src" | "cite" | "usemap")
// Less common (but still valid!) combinations
Expand Down Expand Up @@ -380,4 +393,24 @@ mod tests {
let uris = extract_html(input, false);
assert_eq!(uris, expected);
}

#[test]
fn test_skip_preconnect() {
    // A `<link>` element carrying `rel="preconnect"` (with `rel` listed
    // before `href`) must not contribute any URIs to the extraction result.
    let html = r#"
<link rel="preconnect" href="https://example.com">
"#;

    assert!(extract_html(html, false).is_empty());
}

#[test]
fn test_skip_preconnect_reverse_order() {
    // Same expectation as the plain preconnect test, but with `href` written
    // before `rel`: the element must be skipped regardless of attribute order.
    let html = r#"
<link href="https://example.com" rel="preconnect">
"#;

    assert!(extract_html(html, false).is_empty());
}
}
Loading

0 comments on commit 11adc09

Please sign in to comment.