Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

apply skip_prefixes before parsing external link domain #1833

Merged
merged 2 commits into from
Apr 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 51 additions & 28 deletions components/site/src/link_checking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
}
}

fn should_skip_by_prefix(link: &String, skip_prefixes: &Vec<String>) -> bool {
skip_prefixes.iter().any(|prefix| link.starts_with(prefix))
}

fn get_link_domain(link: &str) -> Result<String> {
return match Url::parse(link) {
Ok(url) => match url.host_str().map(String::from) {
Expand All @@ -109,36 +113,58 @@ fn get_link_domain(link: &str) -> Result<String> {
pub fn check_external_links(site: &Site) -> Result<()> {
let library = site.library.write().expect("Get lock for check_external_links");

let mut all_links: Vec<(PathBuf, String, String)> = vec![];
struct LinkDef {
file_path: PathBuf,
external_link: String,
domain: String,
Comment on lines +116 to +119
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I created this struct because I find named fields easier to follow than indexed ones, but I'm happy to switch back to the tuple if that's preferred.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is fine

}

impl LinkDef {
pub fn new(file_path: PathBuf, external_link: String, domain: String) -> Self {
Self { file_path, external_link, domain }
}
}

let mut checked_links: Vec<LinkDef> = vec![];
let mut skipped_link_count: u32 = 0;

for p in library.pages_values().into_iter() {
for external_link in p.clone().external_links.into_iter() {
let domain = get_link_domain(&external_link)?;
all_links.push((p.file.path.clone(), external_link, domain));
if should_skip_by_prefix(&external_link, &site.config.link_checker.skip_prefixes) {
skipped_link_count += 1;
} else {
let domain = get_link_domain(&external_link)?;
checked_links.push(LinkDef::new(p.file.path.clone(), external_link, domain));
}
}
}

for s in library.sections_values().into_iter() {
for external_link in s.clone().external_links.into_iter() {
let domain = get_link_domain(&external_link)?;
all_links.push((s.file.path.clone(), external_link, domain));
if should_skip_by_prefix(&external_link, &site.config.link_checker.skip_prefixes) {
skipped_link_count += 1;
} else {
let domain = get_link_domain(&external_link)?;
checked_links.push(LinkDef::new(s.file.path.clone(), external_link, domain));
}
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

those 2 sections should really be refactored but I'll do that in my branch.

Copy link
Contributor Author

@mwcz mwcz Apr 26, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I tried combining the iterators with chain so there would only need to be one loop, but couldn't get it to work.

}

println!("Checking {} external link(s).", all_links.len());
println!(
"Checking {} external link(s). Skipping {} external link(s).",
checked_links.len(),
skipped_link_count
);

let mut links_by_domain: HashMap<String, Vec<(PathBuf, String)>> = HashMap::new();
let mut links_by_domain: HashMap<String, Vec<&LinkDef>> = HashMap::new();

for link in all_links.iter() {
links_by_domain.entry(link.2.to_string()).or_default();
for link in checked_links.iter() {
links_by_domain.entry(link.domain.to_string()).or_default();
// Insert content path and link under the domain key
links_by_domain
.get_mut(&link.2.to_string())
.unwrap()
.push((link.0.clone(), link.1.clone()));
links_by_domain.get_mut(&link.domain).unwrap().push(&link);
}

if all_links.is_empty() {
if checked_links.is_empty() {
return Ok(());
}

Expand All @@ -155,20 +181,13 @@ pub fn check_external_links(site: &Site) -> Result<()> {
let mut links_to_process = links.len();
links
.iter()
.filter_map(move |(page_path, link)| {
.filter_map(move |link_def| {
links_to_process -= 1;

if site
.config
.link_checker
.skip_prefixes
.iter()
.any(|prefix| link.starts_with(prefix))
{
return None;
}

let res = link_checker::check_url(link, &site.config.link_checker);
let res = link_checker::check_url(
&link_def.external_link,
&site.config.link_checker,
);

if links_to_process > 0 {
// Prevent rate-limiting, wait before next crawl unless we're done with this domain
Expand All @@ -178,7 +197,7 @@ pub fn check_external_links(site: &Site) -> Result<()> {
if link_checker::is_valid(&res) {
None
} else {
Some((page_path, link, res))
Some((&link_def.file_path, &link_def.external_link, res))
}
})
.collect::<Vec<_>>()
Expand All @@ -187,7 +206,11 @@ pub fn check_external_links(site: &Site) -> Result<()> {
.collect::<Vec<_>>()
});

println!("> Checked {} external link(s): {} error(s) found.", all_links.len(), errors.len());
println!(
"> Checked {} external link(s): {} error(s) found.",
checked_links.len(),
errors.len()
);

if errors.is_empty() {
return Ok(());
Expand Down
33 changes: 30 additions & 3 deletions components/site/tests/site.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ fn can_parse_site() {
let library = site.library.read().unwrap();

// Correct number of pages (sections do not count as pages, draft are ignored)
assert_eq!(library.pages().len(), 32);
assert_eq!(library.pages().len(), 33);
let posts_path = path.join("content").join("posts");

// Make sure the page with a url doesn't have any sections
Expand Down Expand Up @@ -596,7 +596,7 @@ fn can_build_site_with_pagination_for_taxonomy() {
"tags/a/page/1/index.html",
"http-equiv=\"refresh\" content=\"0; url=https://replace-this-with-your-url.com/tags/a/\""
));
assert!(file_contains!(public, "tags/a/index.html", "Num pagers: 8"));
assert!(file_contains!(public, "tags/a/index.html", "Num pagers: 9"));
assert!(file_contains!(public, "tags/a/index.html", "Page size: 2"));
assert!(file_contains!(public, "tags/a/index.html", "Current index: 1"));
assert!(!file_contains!(public, "tags/a/index.html", "has_prev"));
Expand All @@ -609,7 +609,7 @@ fn can_build_site_with_pagination_for_taxonomy() {
assert!(file_contains!(
public,
"tags/a/index.html",
"Last: https://replace-this-with-your-url.com/tags/a/page/8/"
"Last: https://replace-this-with-your-url.com/tags/a/page/9/"
));
assert!(!file_contains!(public, "tags/a/index.html", "has_prev"));

Expand Down Expand Up @@ -774,8 +774,35 @@ fn check_site() {
site.config.link_checker.skip_anchor_prefixes,
vec!["https://github.com/rust-lang/rust/blob/"]
);
assert_eq!(
site.config.link_checker.skip_prefixes,
vec!["http://[2001:db8::]/", "http://invaliddomain"]
);

site.config.enable_check_mode();
site.load().expect("link check test_site");
}

#[test]
#[should_panic]
fn panics_on_invalid_external_domain() {
let (mut site, _tmp_dir, _public) = build_site("test_site");

// remove the invalid domain skip prefix
let i = site
.config
.link_checker
.skip_prefixes
.iter()
.position(|prefix| prefix == "http://invaliddomain")
.unwrap();
site.config.link_checker.skip_prefixes.remove(i);

// confirm the invalid domain skip prefix was removed
assert_eq!(site.config.link_checker.skip_prefixes, vec!["http://[2001:db8::]/"]);

// check the test site, this time without the invalid domain skip prefix, which should cause a
// panic
site.config.enable_check_mode();
site.load().expect("link check test_site");
}
Expand Down
1 change: 1 addition & 0 deletions test_site/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ anchors = "on"
[link_checker]
skip_prefixes = [
"http://[2001:db8::]/",
"http://invaliddomain",
]

skip_anchor_prefixes = [
Expand Down
4 changes: 4 additions & 0 deletions test_site/content/posts/skip_prefixes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
+++
+++

[test skip 1](http://invaliddomain</)