Skip to content

Commit

Permalink
Add heuristic checking for HTML anchors (#1716)
Browse files Browse the repository at this point in the history
* Add heuristic checking for HTML anchors

Previously only anchors specified or generated in markdown could be
linked to, without complaint from the link checker. We now use a
simple heuristic check for `name` or `id` attributes.

Duplicate code has been refactored and all XML anchor checks updated
to use regex rather than substring match.

* Fix regexp and refactor
  • Loading branch information
phillord authored and Keats committed Jan 23, 2022
1 parent 0b7e3cb commit 5190b5e
Show file tree
Hide file tree
Showing 9 changed files with 65 additions and 24 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 1 addition & 6 deletions components/config/src/config/link_checker.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use serde_derive::{Deserialize, Serialize};

#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(default)]
pub struct LinkChecker {
/// Skip link checking for these URL prefixes
Expand All @@ -9,8 +9,3 @@ pub struct LinkChecker {
pub skip_anchor_prefixes: Vec<String>,
}

impl Default for LinkChecker {
fn default() -> LinkChecker {
LinkChecker { skip_prefixes: Vec::new(), skip_anchor_prefixes: Vec::new() }
}
}
5 changes: 5 additions & 0 deletions components/library/src/content/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use crate::content::file_info::FileInfo;
use crate::content::ser::SerializingPage;
use crate::content::{find_related_assets, has_anchor};
use utils::fs::read_file;
use utils::links::has_anchor_id;

lazy_static! {
// Based on https://regex101.com/r/H2n38Z/1/tests
Expand Down Expand Up @@ -300,6 +301,10 @@ impl Page {
has_anchor(&self.toc, anchor)
}

pub fn has_anchor_id(&self, id: &str) -> bool {
has_anchor_id(&self.content, id)
}

pub fn to_serialized<'a>(&'a self, library: &'a Library) -> SerializingPage<'a> {
SerializingPage::from_page(self, library)
}
Expand Down
1 change: 1 addition & 0 deletions components/link_checker/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ lazy_static = "1"

config = { path = "../config" }
errors = { path = "../errors" }
utils = { path = "../utils" }

[dependencies.reqwest]
version = "0.11"
Expand Down
22 changes: 5 additions & 17 deletions components/link_checker/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use config::LinkChecker;
use std::collections::HashMap;
use std::result;
use std::sync::{Arc, RwLock};
use utils::links::has_anchor_id;

pub type Result = result::Result<StatusCode, String>;

Expand Down Expand Up @@ -104,22 +105,9 @@ fn has_anchor(url: &str) -> bool {
fn check_page_for_anchor(url: &str, body: String) -> errors::Result<()> {
let index = url.find('#').unwrap();
let anchor = url.get(index + 1..).unwrap();
let checks = [
format!(" id={}", anchor),
format!(" ID={}", anchor),
format!(" id='{}'", anchor),
format!(" ID='{}'", anchor),
format!(r#" id="{}""#, anchor),
format!(r#" ID="{}""#, anchor),
format!(" name={}", anchor),
format!(" NAME={}", anchor),
format!(" name='{}'", anchor),
format!(" NAME='{}'", anchor),
format!(r#" name="{}""#, anchor),
format!(r#" NAME="{}""#, anchor),
];

if checks.iter().any(|check| body[..].contains(&check[..])) {


if has_anchor_id(&body, &anchor){
Ok(())
} else {
Err(errors::Error::from(format!("Anchor `#{}` not found on page", anchor)))
Expand Down Expand Up @@ -338,7 +326,7 @@ mod tests {
#[test]
fn skip_anchor_prefixes() {
let ignore_url = format!("{}{}", mockito::server_url(), "/ignore/");
let config = LinkChecker { skip_prefixes: vec![], skip_anchor_prefixes: vec![ignore_url] };
let config = LinkChecker { skip_anchor_prefixes: vec![ignore_url], ..Default::default() };

let _m1 = mock("GET", "/ignore/i30hobj1cy")
.with_header("Content-Type", "text/html")
Expand Down
3 changes: 2 additions & 1 deletion components/site/src/link_checking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
let page = library
.get_page(&full_path)
.expect("Couldn't find section in check_internal_links_with_anchors");
!page.has_anchor(anchor)

!(page.has_anchor(anchor)||page.has_anchor_id(anchor))
}
});

Expand Down
1 change: 1 addition & 0 deletions components/utils/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ include = ["src/**/*"]
tera = "1"
unicode-segmentation = "1.2"
walkdir = "2"
regex="1"
toml = "0.5"
serde = { version = "1.0", features = ["derive"] }
slug = "0.1"
Expand Down
1 change: 1 addition & 0 deletions components/utils/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pub mod de;
pub mod fs;
pub mod links;
pub mod minify;
pub mod net;
pub mod site;
Expand Down
47 changes: 47 additions & 0 deletions components/utils/src/links.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
use regex::Regex;


pub fn has_anchor_id(content: &str, anchor: &str) -> bool {
let checks = anchor_id_checks(anchor);
checks.is_match(content)
}

fn anchor_id_checks(anchor:&str) -> Regex {
Regex::new(
&format!(r#" (?i)(id|name) *= *("|')*{}("|'| |>)+"#, anchor)
).unwrap()
}


#[cfg(test)]
mod tests{
use super::anchor_id_checks;

fn check(anchor:&str, content:&str) -> bool {
anchor_id_checks(anchor).is_match(content)
}

#[test]
fn matchers () {
let m = |content| {check("fred", content)};

// Canonical match/non match
assert!(m(r#"<a name="fred">"#));
assert!(m(r#"<a id="fred">"#));
assert!(!m(r#"<a name="george">"#));

// Whitespace variants
assert!(m(r#"<a id ="fred">"#));
assert!(m(r#"<a id = "fred">"#));
assert!(m(r#"<a id="fred" >"#));
assert!(m(r#"<a id="fred" >"#));

// Quote variants
assert!(m(r#"<a id='fred'>"#));
assert!(m(r#"<a id=fred>"#));

// Case variants
assert!(m(r#"<a ID="fred">"#));
assert!(m(r#"<a iD="fred">"#));
}
}

0 comments on commit 5190b5e

Please sign in to comment.