feat: Resolve relative links to absolute ones, link to generated Markdown files

AlphaBs · pawamoy · web-flow · commit 52e0318be196 · 2025-11-20T14:54:20.000+01:00
Issue-22: #22 PR-26: #26 Co-authored-by: Timothée Mazzucotelli <dev@pawamoy.fr>
diff --git a/src/mkdocs_llmstxt/_internal/plugin.py b/src/mkdocs_llmstxt/_internal/plugin.py
@@ -6,7 +6,7 @@
 from itertools import chain
 from pathlib import Path
 from typing import TYPE_CHECKING, NamedTuple, cast
-from urllib.parse import urljoin
+from urllib.parse import urljoin, urlparse
 
 import mdformat
 from bs4 import BeautifulSoup as Soup
@@ -53,6 +53,7 @@ class MkdocsLLMsTxtPlugin(BasePlugin[_PluginConfig]):
     mkdocs_config: MkDocsConfig
     """The global MkDocs configuration."""
 
+    _base_url: str
     _sections: dict[str, dict[str, str]]
     _file_uris: set[str]
     _md_pages: dict[str, _MDPageInfo]
@@ -88,6 +89,16 @@ def on_config(self, config: MkDocsConfig) -> MkDocsConfig | None:
         if config.site_url is None:
             raise ValueError("'site_url' must be set in the MkDocs configuration to be used with the 'llmstxt' plugin")
         self.mkdocs_config = config
+
+        # Use `base_url` if it exists.
+        if self.config.base_url is not None:
+            self._base_url = cast("str", self.config.base_url)
+        else:
+            # Use `site_url`, which we assume to be always specified.
+            self._base_url = cast("str", self.mkdocs_config.site_url)
+        if not self._base_url.endswith("/"):
+            self._base_url += "/"
+
         return config
 
     def on_files(self, files: Files, *, config: MkDocsConfig) -> Files | None:  # noqa: ARG002
@@ -128,25 +139,18 @@ def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None
                 should_autoclean=self.config.autoclean,
                 preprocess=self.config.preprocess,
                 path=str(path_md),
+                base_uri=self._base_url,
+                page_uri=page.file.dest_uri,
             )
 
             md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix()
             # Apply the same logic as in the `Page.url` property.
             if md_url in (".", "./"):
                 md_url = ""
-
-            # Use `base_url` if it exists.
-            if self.config.base_url is not None:
-                base = cast("str", self.config.base_url)
-            else:
-                # Use `site_url`, which we assume to be always specified.
-                base = cast("str", self.mkdocs_config.site_url)
-            if not base.endswith("/"):
-                base += "/"
-            md_url = urljoin(base, md_url)
+            md_url = urljoin(self._base_url, md_url)
 
             self._md_pages[src_uri] = _MDPageInfo(
-                title=page.title if page.title is not None else src_uri,
+                title=str(page.title) if page.title is not None else src_uri,
                 path_md=path_md,
                 md_url=md_url,
                 content=page_md,
@@ -221,6 +225,8 @@ def _generate_page_markdown(
     should_autoclean: bool,
     preprocess: str | None,
     path: str,
+    base_uri: str,
+    page_uri: str,
 ) -> str:
     """Convert HTML to Markdown.
 
@@ -229,6 +235,8 @@ def _generate_page_markdown(
         should_autoclean: Whether to autoclean the HTML.
         preprocess: An optional path of a Python module containing a `preprocess` function.
         path: The output path of the relevant Markdown file.
+        base_uri: The base URI of the site.
+        page_uri: The destination URI of the page.
 
     Returns:
         The Markdown content.
@@ -238,8 +246,58 @@ def _generate_page_markdown(
         autoclean(soup)
     if preprocess:
         _preprocess(soup, preprocess, path)
+    _convert_to_absolute_links(soup, base_uri, page_uri)
     return mdformat.text(
         _converter.convert_soup(soup),
         options={"wrap": "no"},
         extensions=("tables",),
     )
+
+
+def _convert_to_absolute_links(soup: Soup, base_uri: str, page_uri: str) -> None:
+    """Rewrite the `href` of every anchor in the soup as an absolute URL, in place.
+
+    Each non-empty string `href` is passed to `_convert_to_absolute_link`,
+    which decides whether and how it gets rewritten.
+
+    Parameters:
+        soup: The soup to modify.
+        base_uri: The base URI of the site.
+        page_uri: The destination URI of the page.
+    """
+    # Relative links are resolved against the directory containing the page.
+    page_dir = Path(page_uri).parent.as_posix()
+
+    for anchor in soup.find_all("a", href=True):
+        target = anchor.get("href")
+        # Only non-empty string values can be resolved; leave anything else alone.
+        if isinstance(target, str) and target:
+            anchor["href"] = _convert_to_absolute_link(target, base_uri, page_dir)
+
+
+def _convert_to_absolute_link(href: str, base_uri: str, current_dir: str) -> str:
+    """Resolve a relative link against the site base and the page's directory.
+
+    Absolute paths, anchor-only links, links with a scheme and unparsable links
+    are returned unchanged. Resolved links whose path ends with `/` get
+    `index.md` appended to the path, keeping any query string or fragment.
+
+    Parameters:
+        href: The link to convert.
+        base_uri: The base URI of the site.
+        current_dir: The directory of the current page, relative to the site root.
+
+    Returns:
+        The converted link, or `href` itself when it is left untouched.
+    """
+    # Absolute paths and anchor-only links are kept as is.
+    if href.startswith(("/", "#")):
+        return href
+
+    try:
+        # A scheme (`https:`, `mailto:`, ...) marks an external link.
+        if urlparse(href).scheme:
+            return href
+    except ValueError:
+        # Invalid URL, return as is.
+        return href
+
+    # Resolve relative to the current page's directory.
+    relative_base = urljoin(base_uri, current_dir + "/") if current_dir else base_uri
+    final_href = urljoin(relative_base, href)
+
+    # Directory URLs point to `index.md` files. The check is made on the path only,
+    # so that `page/#anchor` becomes `page/index.md#anchor` and a trailing `/`
+    # inside a query string or fragment is not mistaken for a directory.
+    parsed = urlparse(final_href)
+    if parsed.path.endswith("/"):
+        return parsed._replace(path=parsed.path + "index.md").geturl()
+
+    return final_href
diff --git a/tests/test_link_conversion.py b/tests/test_link_conversion.py
@@ -0,0 +1,50 @@
+"""Unit tests for link conversion helpers."""
+
+from __future__ import annotations
+
+import pytest
+
+from mkdocs_llmstxt._internal.plugin import _convert_to_absolute_link
+
+# Site root passed as `base_uri` to the helper in every test below.
+BASE_URI = "https://example.org/en/0.1.34/"
+# Directory of the page holding the links, passed as `current_dir`.
+PAGE_DIR = "page2"
+
+
+@pytest.mark.parametrize(
+    ("href", "current_dir", "expected"),
+    [
+        ("../", PAGE_DIR, "https://example.org/en/0.1.34/index.md"),
+        ("../page1/", PAGE_DIR, "https://example.org/en/0.1.34/page1/index.md"),
+        ("../dummy/", PAGE_DIR, "https://example.org/en/0.1.34/dummy/index.md"),
+        ("section/guide/", "", "https://example.org/en/0.1.34/section/guide/index.md"),
+        ("../assets/reference.md", PAGE_DIR, "https://example.org/en/0.1.34/assets/reference.md"),
+    ],
+)
+def test_relative_links_are_made_absolute(href: str, current_dir: str, expected: str) -> None:
+    """Resolve relative links against the base URI and the page directory."""
+    resolved = _convert_to_absolute_link(href, base_uri=BASE_URI, current_dir=current_dir)
+    assert resolved == expected
+
+
+@pytest.mark.parametrize("href", ["/abs1/", "/abs2/index.md"])
+def test_absolute_paths_are_untouched(href: str) -> None:
+    """Links starting with `/` come back exactly as given."""
+    converted = _convert_to_absolute_link(href, base_uri=BASE_URI, current_dir=PAGE_DIR)
+    assert converted == href
+
+
+@pytest.mark.parametrize(
+    "href",
+    [
+        "https://example.com",
+        "ftp://example.com/resource",
+        "mailto:test@example.com",
+    ],
+)
+def test_external_links_are_preserved(href: str) -> None:
+    """Links carrying a URL scheme come back exactly as given."""
+    converted = _convert_to_absolute_link(href, base_uri=BASE_URI, current_dir=PAGE_DIR)
+    assert converted == href
+
+
+@pytest.mark.parametrize("href", ["#section"])
+def test_anchor_links_are_preserved(href: str) -> None:
+    """Links starting with `#` come back exactly as given."""
+    converted = _convert_to_absolute_link(href, base_uri=BASE_URI, current_dir=PAGE_DIR)
+    assert converted == href
diff --git a/tests/test_plugin.py b/tests/test_plugin.py
@@ -1,6 +1,7 @@
 """Tests for the plugin."""
 
 from pathlib import Path
+from textwrap import dedent
 
 import pytest
 from mkdocs.commands.build import build
@@ -20,14 +21,24 @@
                             "sections": {
                                 "Index": ["index.md"],
                                 "Usage": [{"page1.md": "Some usage docs."}],
+                                "Links": [{"page2.md": "Page with links."}],
                             },
                         },
                     },
                 ],
             },
             "pages": {
                 "index.md": "# Hello world",
+                "dummy.md": "# Hello world",
                 "page1.md": "# Usage\n\nSome paragraph.",
+                "page2.md": dedent(
+                    """
+                    # Links
+
+                    [Relative link 1](./index.md)
+                    [Absolute link 1](/abs1/)
+                    """,
+                ),
             },
         },
     ],
@@ -56,3 +67,14 @@ def test_plugin(mkdocs_conf: MkDocsConfig) -> None:
     page1md = Path(mkdocs_conf.site_dir, "page1/index.md")
     assert page1md.exists()
     assert "Some paragraph." in page1md.read_text()
+
+    page2md = Path(mkdocs_conf.site_dir, "page2/index.md")
+    assert page2md.exists()
+    page2md_content = page2md.read_text()
+
+    # Check that relative links are made absolute in each page and in the full llmstxt file.
+    assert "(https://example.org/en/0.1.34/index.md)" in page2md_content
+    assert "(/abs1/)" in page2md_content
+
+    # Check that llmstxt pages (Markdown) contain links to other llmstxt pages, not HTML ones.
+    assert '"https://example.org/en/0.1.34/index.html"' not in llmsfulltxt_content