66from itertools import chain
77from pathlib import Path
88from typing import TYPE_CHECKING , NamedTuple , cast
9- from urllib .parse import urljoin
9+ from urllib .parse import urljoin , urlparse
1010
1111import mdformat
1212from bs4 import BeautifulSoup as Soup
@@ -53,6 +53,7 @@ class MkdocsLLMsTxtPlugin(BasePlugin[_PluginConfig]):
5353 mkdocs_config : MkDocsConfig
5454 """The global MkDocs configuration."""
5555
56+ _base_url : str
5657 _sections : dict [str , dict [str , str ]]
5758 _file_uris : set [str ]
5859 _md_pages : dict [str , _MDPageInfo ]
@@ -88,6 +89,16 @@ def on_config(self, config: MkDocsConfig) -> MkDocsConfig | None:
8889 if config .site_url is None :
8990 raise ValueError ("'site_url' must be set in the MkDocs configuration to be used with the 'llmstxt' plugin" )
9091 self .mkdocs_config = config
92+
93+ # Use `base_url` if it exists.
94+ if self .config .base_url is not None :
95+ self ._base_url = cast ("str" , self .config .base_url )
96+ else :
97+ # Use `site_url`, which we assume to be always specified.
98+ self ._base_url = cast ("str" , self .mkdocs_config .site_url )
99+ if not self ._base_url .endswith ("/" ):
100+ self ._base_url += "/"
101+
91102 return config
92103
93104 def on_files (self , files : Files , * , config : MkDocsConfig ) -> Files | None : # noqa: ARG002
@@ -128,25 +139,18 @@ def on_page_content(self, html: str, *, page: Page, **kwargs: Any) -> str | None
128139 should_autoclean = self .config .autoclean ,
129140 preprocess = self .config .preprocess ,
130141 path = str (path_md ),
142+ base_uri = self ._base_url ,
143+ page_uri = page .file .dest_uri ,
131144 )
132145
133146 md_url = Path (page .file .dest_uri ).with_suffix (".md" ).as_posix ()
134147 # Apply the same logic as in the `Page.url` property.
135148 if md_url in ("." , "./" ):
136149 md_url = ""
137-
138- # Use `base_url` if it exists.
139- if self .config .base_url is not None :
140- base = cast ("str" , self .config .base_url )
141- else :
142- # Use `site_url`, which we assume to be always specified.
143- base = cast ("str" , self .mkdocs_config .site_url )
144- if not base .endswith ("/" ):
145- base += "/"
146- md_url = urljoin (base , md_url )
150+ md_url = urljoin (self ._base_url , md_url )
147151
148152 self ._md_pages [src_uri ] = _MDPageInfo (
149- title = page .title if page .title is not None else src_uri ,
153+ title = str ( page .title ) if page .title is not None else src_uri ,
150154 path_md = path_md ,
151155 md_url = md_url ,
152156 content = page_md ,
@@ -221,6 +225,8 @@ def _generate_page_markdown(
221225 should_autoclean : bool ,
222226 preprocess : str | None ,
223227 path : str ,
228+ base_uri : str ,
229+ page_uri : str ,
224230) -> str :
225231 """Convert HTML to Markdown.
226232
@@ -229,6 +235,8 @@ def _generate_page_markdown(
229235 should_autoclean: Whether to autoclean the HTML.
230236 preprocess: An optional path of a Python module containing a `preprocess` function.
231237 path: The output path of the relevant Markdown file.
238+ base_uri: The base URI of the site.
239+ page_uri: The destination URI of the page.
232240
233241 Returns:
234242 The Markdown content.
@@ -238,8 +246,58 @@ def _generate_page_markdown(
238246 autoclean (soup )
239247 if preprocess :
240248 _preprocess (soup , preprocess , path )
249+ _convert_to_absolute_links (soup , base_uri , page_uri )
241250 return mdformat .text (
242251 _converter .convert_soup (soup ),
243252 options = {"wrap" : "no" },
244253 extensions = ("tables" ,),
245254 )
255+
256+
def _convert_to_absolute_links(soup: Soup, base_uri: str, page_uri: str) -> None:
    """Rewrite the relative `href` of every anchor in the HTML as an absolute link.

    The soup is modified in place.

    Parameters:
        soup: The soup to modify.
        base_uri: The base URI of the site.
        page_uri: The destination URI of the page.
    """
    # Relative links are resolved against the directory that holds the page.
    page_dir = Path(page_uri).parent.as_posix()

    for anchor in soup.find_all("a", href=True):
        target = anchor.get("href")

        # Only non-empty string targets are rewritten; anything else is left untouched.
        if isinstance(target, str) and target:
            anchor["href"] = _convert_to_absolute_link(target, base_uri, page_dir)
276+
277+
def _convert_to_absolute_link(href: str, base_uri: str, current_dir: str) -> str:
    """Resolve a single relative link into an absolute one.

    Links that are already absolute paths (starting with `/`), in-page anchors
    (starting with `#`), links carrying a scheme (external links, `mailto:`, ...)
    and links that cannot be parsed are returned unchanged. Any other link is
    resolved against `current_dir` below `base_uri`. When the resolved path
    designates a directory (ends with `/`), `index.md` is appended to the path;
    a query string or fragment, if any, is kept after it.

    Parameters:
        href: The link target found in the HTML.
        base_uri: The base URI of the site.
        current_dir: The directory of the current page, relative to the site root.

    Returns:
        The absolute link, or `href` itself when it must not be rewritten.
    """
    # Absolute paths and in-page anchors are left alone.
    if href.startswith(("/", "#")):
        return href

    # Anything with a scheme is external; unparsable links are returned as is.
    try:
        if urlparse(href).scheme:
            return href
    except ValueError:
        return href

    # Relative path from current directory.
    relative_base = urljoin(base_uri, current_dir + "/") if current_dir else base_uri
    final_href = urljoin(relative_base, href)

    # Separate the location from a trailing query string and/or fragment, so that
    # the directory check looks at the path only: `dir/#sec` is a directory link,
    # whereas `page.md#a/` is not.
    cut = len(final_href)
    for marker in ("?", "#"):
        position = final_href.find(marker)
        if position != -1:
            cut = min(cut, position)
    location, trailer = final_href[:cut], final_href[cut:]

    # Convert directory paths (ending with `/`) to point to `index.md` files.
    if location.endswith("/"):
        return f"{location}index.md{trailer}"

    return final_href
0 commit comments