Add basic netlify redirects support (#187)

untitaker · web-flow · commit d38fcd8b867d · 2025-11-23T18:30:28.000+01:00
* Add basic netlify redirects support

* ignore nonroot redirects

* fix tests on windows

* fixup tests

* restructure docs

* fix install-tester workflow

* upgrade dist
diff --git a/README.md b/README.md
@@ -184,6 +184,41 @@ and `--github-actions` feature.
   fairly feature-rich, but was a non-starter due to performance. This applies
   to other countless link checkers we tried that are not mentioned here.
 
+## Redirects
+
+Since 0.1.45, `hyperlink` supports reading configured redirects from a file.
+
+At the root of your site, make a file `_redirects`:
+
+```
+# lines starting with # are ignored
+/old-url.html /new-url.html
+
+# on the next line, trailing data like the 301 status code is ignored
+/old-url2.html /new-url2.html  301
+
+# /old-url.html will become a valid link target
+# hyperlink will validate that /new-url.html exists.
+```
+
+This format is supported by at least Netlify, [Codeberg
+Pages](https://codeberg.page) and [Grebedoc](https://grebedoc.dev).
+
+References for this format can be found at
+[Codeberg](https://docs.codeberg.org/codeberg-pages/redirects/) and
+[Netlify](https://docs.netlify.com/manage/routing/redirects/overview/).
+
+The major things missing from the implementation are:
+
+* `hyperlink` completely ignores any status codes or country code conditions.
+  The only things it parses are the `from` and `to` fields; the rest is ignored.
+
+* "Splat sources" (`/articles/*`) and "splat targets" (`/posts/:splat`) are
+  not supported.
+
+* Generally speaking, `hyperlink` does not support "pretty URLs", i.e. one
+  cannot request `/mypage` and expect `mypage.html` to be loaded.
+
 ## Testimonials
 
 > We use Hyperlink to check for dead links on
diff --git a/src/html/mod.rs b/src/html/mod.rs
@@ -3,7 +3,7 @@ mod parser;
 use std::borrow::Cow;
 use std::fmt;
 use std::fs;
-use std::io::Read;
+use std::io::{BufRead, BufReader, Read};
 use std::path::{Path, PathBuf};
 use std::str;
 use std::sync::Arc;
@@ -308,6 +308,41 @@ impl Document {
         Href(href.into_bump_str())
     }
 
+    pub fn extract_links<'b, 'l, P: ParagraphWalker, F>( // Passes each link of this document to `callback`.
+        &self,
+        doc_buf: &'b mut DocumentBuffers,
+        check_anchors: bool,
+        mut callback: F,
+    ) -> Result<bool, Error> // Ok(true): file was parsed; Ok(false): file type is not handled
+    where
+        'b: 'l,
+        F: FnMut(Link<'l, P::Paragraph>),
+    {
+        if self.href == "_redirects" { // whole-href comparison; presumably only the site-root file matches
+            for link in self.parse_redirects::<P>(doc_buf, check_anchors)? {
+                callback(link);
+            }
+            return Ok(true);
+        }
+
+        if self
+            .path
+            .extension()
+            .and_then(|extension| {
+                let ext = extension.to_str()?; // a non-UTF-8 extension yields None, i.e. not HTML
+                Some(ext == "html" || ext == "htm")
+            })
+            .unwrap_or(false) // a path without any extension is also treated as not HTML
+        {
+            for link in self.links_from_html::<P>(doc_buf, check_anchors)? {
+                callback(link);
+            }
+            return Ok(true);
+        }
+
+        Ok(false) // neither `_redirects` nor *.html / *.htm: callback is never invoked
+    }
+
     pub fn links<'b, 'l, P: ParagraphWalker>(
         &self,
         doc_buf: &'b mut DocumentBuffers,
@@ -319,6 +354,62 @@ impl Document {
         self.links_from_read::<_, P>(doc_buf, fs::File::open(&*self.path)?, check_anchors)
     }
 
+    fn links_from_html<'b, 'l, P: ParagraphWalker>( // Opens `self.path` and delegates to `links_from_read`.
+        &self,
+        doc_buf: &'b mut DocumentBuffers,
+        check_anchors: bool,
+    ) -> Result<impl Iterator<Item = Link<'l, P::Paragraph>>, Error>
+    where
+        'b: 'l,
+    { // a failure to open the file is returned to the caller through `?`
+        self.links_from_read::<_, P>(doc_buf, fs::File::open(&*self.path)?, check_anchors)
+    }
+
+    fn parse_redirects<'b, 'l, P: ParagraphWalker>( // Reads `self.path` as lines of `from to [ignored...]`.
+        &self,
+        doc_buf: &'b mut DocumentBuffers,
+        check_anchors: bool,
+    ) -> Result<impl Iterator<Item = Link<'l, P::Paragraph>>, Error>
+    where
+        'b: 'l,
+    {
+        let mut link_buf = BumpVec::new_in(&doc_buf.arena); // links are collected in the document arena
+        let file = fs::File::open(&*self.path)?;
+        let reader = BufReader::new(file);
+
+        for line in reader.lines() {
+            let line = line?; // a read error (including invalid UTF-8) aborts the whole parse
+
+            let trimmed = line.trim();
+            if trimmed.is_empty() || trimmed.starts_with('#') { // skip blank lines and `#` comments
+                continue;
+            }
+
+            let parts: Vec<&str> = trimmed.split_whitespace().collect();
+            if parts.len() >= 2 { // a line with a single field is silently skipped
+                let source = parts[0];
+                let target = parts[1]; // parts[2..] (e.g. a status code) is never read
+
+                let source_str = doc_buf.arena.alloc_str(source);
+                let target_str = doc_buf.arena.alloc_str(target);
+
+                link_buf.push(Link::Defines(DefinedLink { // the redirect source is recorded as a defined link
+                    href: self.join(&doc_buf.arena, check_anchors, source_str),
+                }));
+
+                if !is_external_link(target.as_bytes()) { // external targets are not recorded as used links
+                    link_buf.push(Link::Uses(UsedLink {
+                        href: self.join(&doc_buf.arena, check_anchors, target_str),
+                        path: self.path.clone(),
+                        paragraph: None, // a redirect line has no surrounding paragraph
+                    }));
+                }
+            }
+        }
+
+        Ok(link_buf.into_iter())
+    }
+
     fn links_from_read<'b, 'l, R: Read, P: ParagraphWalker>(
         &self,
         doc_buf: &'b mut DocumentBuffers,
diff --git a/src/main.rs b/src/main.rs
@@ -468,26 +468,17 @@ fn extract_html_links<C: LinkCollector<P::Paragraph>, P: ParagraphWalker>(
                 }));
                 file_count += 1;
 
-                if !document
-                    .path
-                    .extension()
-                    .and_then(|extension| Some(HTML_FILES.contains(&extension.to_str()?)))
-                    .unwrap_or(false)
-                {
-                    return Ok((doc_buf, collector, documents_count, file_count));
+                let was_parsed = document
+                    .extract_links::<P, _>(&mut doc_buf, check_anchors, |link| {
+                        collector.ingest(link);
+                    })
+                    .with_context(|| format!("Failed to read file {}", document.path.display()))?;
+
+                if was_parsed {
+                    doc_buf.reset();
+                    documents_count += 1;
                 }
 
-                for link in document
-                    .links::<P>(&mut doc_buf, check_anchors)
-                    .with_context(|| format!("Failed to read file {}", document.path.display()))?
-                {
-                    collector.ingest(link);
-                }
-
-                doc_buf.reset();
-
-                documents_count += 1;
-
                 Ok((doc_buf, collector, documents_count, file_count))
             },
         )
diff --git a/tests/cli_snapshots.rs b/tests/cli_snapshots.rs
@@ -1,3 +1,4 @@
+use assert_fs::prelude::*;
 use insta_cmd::{assert_cmd_snapshot, get_cargo_bin};
 use std::process::Command;
 
@@ -98,3 +99,85 @@ fn test_version() {
     ----- stderr -----
     "###);
 }
+
+#[test]
+fn test_redirects() { // Root `_redirects`: sources are valid targets, a missing local target is reported.
+    let site = assert_fs::TempDir::new().unwrap();
+
+    site.child("_redirects")
+        .write_str(
+            "# This is a comment\n\
+             \n\
+             /old-page /new-page.html 301\n\
+             /external https://example.com/page\n\
+             /broken /missing-page.html\n\
+             /another /target.html",
+        )
+        .unwrap();
+
+    site.child("new-page.html").touch().unwrap();
+    site.child("target.html").touch().unwrap(); // missing-page.html is deliberately not created
+
+    site.child("index.html")
+        .write_str("<a href='/old-page'>link</a>") // points at a redirect source, not at a real file
+        .unwrap();
+
+    let mut settings = insta::Settings::clone_current();
+    settings.add_filter(r"[/\\]", "/"); // rewrite `\` to `/` so the snapshot is separator-independent
+    let _guard = settings.bind_to_scope();
+
+    assert_cmd_snapshot!(cli().arg(".").current_dir(site.path()), @r###"
+    success: false
+    exit_code: 1
+    ----- stdout -----
+    Reading files
+    Checking 4 links from 4 files (4 documents)
+    ./_redirects
+      error: bad link /missing-page.html
+
+    Found 1 bad links
+
+    ----- stderr -----
+    "###);
+
+    site.close().unwrap(); // explicit close so a cleanup failure fails the test
+}
+
+#[test]
+fn test_redirects_only_at_root() { // A `_redirects` file in a subdirectory must not define any links.
+    let site = assert_fs::TempDir::new().unwrap();
+
+    site.child("_redirects")
+        .write_str("/old-page /new-page.html")
+        .unwrap();
+
+    site.child("subdir/_redirects") // nested file: expected to be counted as a file but not parsed
+        .write_str("/sub-old /sub-new.html")
+        .unwrap();
+
+    site.child("new-page.html").touch().unwrap();
+
+    site.child("index.html")
+        .write_str("<a href='/old-page'>link to old</a><a href='/sub-old'>link to sub</a>")
+        .unwrap();
+
+    let mut settings = insta::Settings::clone_current();
+    settings.add_filter(r"[/\\]", "/"); // rewrite `\` to `/` so the snapshot is separator-independent
+    let _guard = settings.bind_to_scope();
+
+    assert_cmd_snapshot!(cli().arg(".").current_dir(site.path()), @r###"
+    success: false
+    exit_code: 1
+    ----- stdout -----
+    Reading files
+    Checking 3 links from 4 files (3 documents)
+    ./index.html
+      error: bad link /sub-old
+
+    Found 1 bad links
+
+    ----- stderr -----
+    "###);
+
+    site.close().unwrap(); // explicit close so a cleanup failure fails the test
+}