Skip to content

Commit 52e0318

Browse files
AlphaBs and pawamoy authored
feat: Resolve relative links to absolute ones, link to generated Markdown files
Issue-22: #22 PR-26: #26 Co-authored-by: Timothée Mazzucotelli <dev@pawamoy.fr>
1 parent ad6dec6 commit 52e0318

File tree

3 files changed

+142
-12
lines changed

3 files changed

+142
-12
lines changed

src/mkdocs_llmstxt/_internal/plugin.py

Lines changed: 70 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from itertools import chain
77
from pathlib import Path
88
from typing import TYPE_CHECKING, NamedTuple, cast
9-
from urllib.parse import urljoin
9+
from urllib.parse import urljoin, urlparse
1010

1111
import mdformat
1212
from bs4 import BeautifulSoup as Soup
@@ -53,6 +53,7 @@ class MkdocsLLMsTxtPlugin(BasePlugin[_PluginConfig]):
5353
mkdocs_config: MkDocsConfig
5454
"""The global MkDocs configuration."""
5555

56+
_base_url: str
5657
_sections: dict[str, dict[str, str]]
5758
_file_uris: set[str]
5859
_md_pages: dict[str, _MDPageInfo]
@@ -88,6 +89,16 @@ def on_config(self, config: MkDocsConfig) -> MkDocsConfig | None:
8889
if config.site_url is None:
8990
raise ValueError("'site_url' must be set in the MkDocs configuration to be used with the 'llmstxt' plugin")
9091
self.mkdocs_config = config
92+
93+
# Use `base_url` if it exists.
94+
if self.config.base_url is not None:
95+
self._base_url = cast("str", self.config.base_url)
96+
else:
97+
# Use `site_url`, which we assume to be always specified.
98+
self._base_url = cast("str", self.mkdocs_config.site_url)
99+
if not self._base_url.endswith("/"):
100+
self._base_url += "/"
101+
91102
return config
92103

93104
def on_files(self, files: Files, *, config: MkDocsConfig) -> Files | None: # noqa: ARG002
@@ -128,25 +139,18 @@ def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None
128139
should_autoclean=self.config.autoclean,
129140
preprocess=self.config.preprocess,
130141
path=str(path_md),
142+
base_uri=self._base_url,
143+
page_uri=page.file.dest_uri,
131144
)
132145

133146
md_url = Path(page.file.dest_uri).with_suffix(".md").as_posix()
134147
# Apply the same logic as in the `Page.url` property.
135148
if md_url in (".", "./"):
136149
md_url = ""
137-
138-
# Use `base_url` if it exists.
139-
if self.config.base_url is not None:
140-
base = cast("str", self.config.base_url)
141-
else:
142-
# Use `site_url`, which we assume to be always specified.
143-
base = cast("str", self.mkdocs_config.site_url)
144-
if not base.endswith("/"):
145-
base += "/"
146-
md_url = urljoin(base, md_url)
150+
md_url = urljoin(self._base_url, md_url)
147151

148152
self._md_pages[src_uri] = _MDPageInfo(
149-
title=page.title if page.title is not None else src_uri,
153+
title=str(page.title) if page.title is not None else src_uri,
150154
path_md=path_md,
151155
md_url=md_url,
152156
content=page_md,
@@ -221,6 +225,8 @@ def _generate_page_markdown(
221225
should_autoclean: bool,
222226
preprocess: str | None,
223227
path: str,
228+
base_uri: str,
229+
page_uri: str,
224230
) -> str:
225231
"""Convert HTML to Markdown.
226232
@@ -229,6 +235,8 @@ def _generate_page_markdown(
229235
should_autoclean: Whether to autoclean the HTML.
230236
preprocess: An optional path of a Python module containing a `preprocess` function.
231237
path: The output path of the relevant Markdown file.
238+
base_uri: The base URI of the site.
239+
page_uri: The destination URI of the page.
232240
233241
Returns:
234242
The Markdown content.
@@ -238,8 +246,58 @@ def _generate_page_markdown(
238246
autoclean(soup)
239247
if preprocess:
240248
_preprocess(soup, preprocess, path)
249+
_convert_to_absolute_links(soup, base_uri, page_uri)
241250
return mdformat.text(
242251
_converter.convert_soup(soup),
243252
options={"wrap": "no"},
244253
extensions=("tables",),
245254
)
255+
256+
257+
def _convert_to_absolute_links(soup: Soup, base_uri: str, page_uri: str) -> None:
    """Rewrite the `href` of every anchor tag in the soup as an absolute link.

    The soup is modified in place.

    Parameters:
        soup: The soup to modify.
        base_uri: The base URI of the site.
        page_uri: The destination URI of the page.
    """
    # Relative links are resolved against the directory that contains the page.
    page_dir = Path(page_uri).parent.as_posix()

    for anchor in soup.find_all("a", href=True):
        target = anchor.get("href")

        # Only non-empty string targets are rewritten; anything else is left alone.
        if isinstance(target, str) and target:
            anchor["href"] = _convert_to_absolute_link(target, base_uri, page_dir)
276+
277+
278+
def _convert_to_absolute_link(href: str, base_uri: str, current_dir: str) -> str:
279+
# Skip if it's an absolute path
280+
if href.startswith("/"):
281+
return href
282+
283+
# Skip if it's an anchor link (starts with `#`).
284+
if href.startswith("#"):
285+
return href
286+
287+
# Skip if it's an external link
288+
try:
289+
if urlparse(href).scheme:
290+
return href
291+
except ValueError:
292+
# Invalid URL, return as is
293+
return href
294+
295+
# Relative path from current directory.
296+
relative_base = urljoin(base_uri, current_dir + "/") if current_dir else base_uri
297+
final_href = urljoin(relative_base, href)
298+
299+
# Convert directory paths (ending with `/`) to point to `index.md` files.
300+
if final_href.endswith("/"):
301+
final_href = final_href + "index.md"
302+
303+
return final_href

tests/test_link_conversion.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""Unit tests for link conversion helpers."""
2+
3+
from __future__ import annotations
4+
5+
import pytest
6+
7+
from mkdocs_llmstxt._internal.plugin import _convert_to_absolute_link
8+
9+
BASE_URI = "https://example.org/en/0.1.34/"
10+
PAGE_DIR = "page2"
11+
12+
13+
@pytest.mark.parametrize(
14+
("href", "current_dir", "expected"),
15+
[
16+
("../", PAGE_DIR, "https://example.org/en/0.1.34/index.md"),
17+
("../page1/", PAGE_DIR, "https://example.org/en/0.1.34/page1/index.md"),
18+
("../dummy/", PAGE_DIR, "https://example.org/en/0.1.34/dummy/index.md"),
19+
("section/guide/", "", "https://example.org/en/0.1.34/section/guide/index.md"),
20+
("../assets/reference.md", PAGE_DIR, "https://example.org/en/0.1.34/assets/reference.md"),
21+
],
22+
)
23+
def test_relative_links_are_made_absolute(href: str, current_dir: str, expected: str) -> None:
24+
"""Relative links should be converted into absolute Markdown URLs."""
25+
assert _convert_to_absolute_link(href, base_uri=BASE_URI, current_dir=current_dir) == expected
26+
27+
28+
@pytest.mark.parametrize("href", ["/abs1/", "/abs2/index.md"])
29+
def test_absolute_paths_are_untouched(href: str) -> None:
30+
"""Absolute paths must pass through unchanged."""
31+
assert _convert_to_absolute_link(href, base_uri=BASE_URI, current_dir=PAGE_DIR) == href
32+
33+
34+
@pytest.mark.parametrize(
35+
"href",
36+
[
37+
"https://example.com",
38+
"ftp://example.com/resource",
39+
"mailto:test@example.com",
40+
],
41+
)
42+
def test_external_links_are_preserved(href: str) -> None:
43+
"""External links should stay untouched."""
44+
assert _convert_to_absolute_link(href, base_uri=BASE_URI, current_dir=PAGE_DIR) == href
45+
46+
47+
@pytest.mark.parametrize("href", ["#section"])
48+
def test_anchor_links_are_preserved(href: str) -> None:
49+
"""Anchor links are not rewritten."""
50+
assert _convert_to_absolute_link(href, base_uri=BASE_URI, current_dir=PAGE_DIR) == href

tests/test_plugin.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Tests for the plugin."""
22

33
from pathlib import Path
4+
from textwrap import dedent
45

56
import pytest
67
from mkdocs.commands.build import build
@@ -20,14 +21,24 @@
2021
"sections": {
2122
"Index": ["index.md"],
2223
"Usage": [{"page1.md": "Some usage docs."}],
24+
"Links": [{"page2.md": "Page with links."}],
2325
},
2426
},
2527
},
2628
],
2729
},
2830
"pages": {
2931
"index.md": "# Hello world",
32+
"dummy.md": "# Hello world",
3033
"page1.md": "# Usage\n\nSome paragraph.",
34+
"page2.md": dedent(
35+
"""
36+
# Links
37+
38+
[Relative link 1](./index.md)
39+
[Absolute link 1](/abs1/)
40+
""",
41+
),
3142
},
3243
},
3344
],
@@ -56,3 +67,14 @@ def test_plugin(mkdocs_conf: MkDocsConfig) -> None:
5667
page1md = Path(mkdocs_conf.site_dir, "page1/index.md")
5768
assert page1md.exists()
5869
assert "Some paragraph." in page1md.read_text()
70+
71+
page2md = Path(mkdocs_conf.site_dir, "page2/index.md")
72+
assert page2md.exists()
73+
page2md_content = page2md.read_text()
74+
75+
# Check that relative links are made absolute in each page and in the full llmstxt file.
76+
assert "(https://example.org/en/0.1.34/index.md)" in page2md_content
77+
assert "(/abs1/)" in page2md_content
78+
79+
# Check that llmstxt pages (Markdown) contain links to other llmstxt pages, not HTML ones.
80+
assert '"https://example.org/en/0.1.34/index.html"' not in llmsfulltxt_content

0 commit comments

Comments
 (0)