diff --git a/README.md b/README.md index f5fa39f..b868539 100644 --- a/README.md +++ b/README.md @@ -182,3 +182,49 @@ You can also change the actual identifier of a heading, thanks again to the `att ``` ...though note that this will impact the URL anchor too (and therefore the permalink to the heading). + +### Backlinks + +The autorefs plugin supports recording backlinks, which other plugins or systems can then use to render backlinks into pages. + +For example, when linking from page `foo/`, section `Section`, to a heading with identifier `heading` thanks to a cross-reference `[some heading][heading]`, the plugin will record that `foo/#section` references `heading`. + +```md +# Page foo + +This is page foo. + +## Section + +This section references [some heading][heading]. +``` + +Other plugins or systems integrating with the autorefs plugin can then retrieve backlinks for a specific identifier, passing the URL of the page where the backlinks will be rendered as `from_url` (returned URLs are relative to it): + +```python +backlinks = autorefs_plugin.get_backlinks("heading", from_url="") +``` + +The `get_backlinks` method returns a map of backlink types to sets of backlinks. A backlink holds a tuple of navigation breadcrumbs, each breadcrumb having a title and URL. + +```python +print(backlinks) +# { +# "referenced-by": { +# Backlink( +# crumbs=( +# BacklinkCrumb(title="Foo", url="foo/"), +# BacklinkCrumb(title="Section", url="foo/#section"), +# ), +# ), +# } +# } +``` + +The default backlink type is `referenced-by`, but it can be customized by other plugins or systems thanks to the `backlink-type` HTML data attribute on `autoref` elements. Such plugins and systems can also specify the anchor on the current page to use for the backlink with the `backlink-anchor` HTML data attribute on `autoref` elements. + +```html +<autoref identifier="heading" backlink-type="mentioned-by" backlink-anchor="section-paragraph">some heading</autoref> +``` + +This feature is primarily designed for use in [mkdocstrings](https://mkdocstrings.github.io/) handlers. 
diff --git a/src/mkdocs_autorefs/backlinks.py b/src/mkdocs_autorefs/backlinks.py new file mode 100644 index 0000000..90a5f96 --- /dev/null +++ b/src/mkdocs_autorefs/backlinks.py @@ -0,0 +1,81 @@ +"""Backlinks module.""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass +from typing import TYPE_CHECKING, ClassVar + +from markdown.core import Markdown +from markdown.treeprocessors import Treeprocessor + +if TYPE_CHECKING: + from xml.etree.ElementTree import Element + + from markdown import Markdown + + from mkdocs_autorefs.plugin import AutorefsPlugin + +try: + from mkdocs.plugins import get_plugin_logger + + log = get_plugin_logger(__name__) +except ImportError: + # TODO: remove once support for MkDocs <1.5 is dropped + log = logging.getLogger(f"mkdocs.plugins.{__name__}") # type: ignore[assignment] + + +@dataclass(eq=True, frozen=True, order=True) +class BacklinkCrumb: + """A navigation breadcrumb for a backlink.""" + + title: str + url: str + + +@dataclass(eq=True, frozen=True, order=True) +class Backlink: + """A backlink (tuple of breadcrumbs).""" + + crumbs: tuple[BacklinkCrumb, ...] + + +class BacklinksTreeProcessor(Treeprocessor): + """Enhance autorefs with `backlink-type` and `backlink-anchor` attributes. + + These attributes are then used later to register backlinks. + """ + + name: str = "mkdocs-autorefs-backlinks" + initial_id: str | None = None + _htags: ClassVar[set[str]] = {"h1", "h2", "h3", "h4", "h5", "h6"} + + def __init__(self, plugin: AutorefsPlugin, md: Markdown | None = None) -> None: + """Initialize the tree processor. + + Parameters: + plugin: A reference to the autorefs plugin, to check its `current_page` attribute. 
+ """ + super().__init__(md) + self.plugin = plugin + self.last_heading_id: str | None = None + + def run(self, root: Element) -> None: # noqa: D102 + if self.plugin.current_page is not None: + self.last_heading_id = self.initial_id + self._enhance_autorefs(root) + + def _enhance_autorefs(self, parent: Element) -> None: + for el in parent: + if el.tag == "a": # Markdown anchor. + if not (el.text or el.get("href") or (el.tail and el.tail.strip())) and (anchor_id := el.get("id")): + self.last_heading_id = anchor_id + elif el.tag in self._htags: # Heading. + self.last_heading_id = el.get("id") + elif el.tag == "autoref": + if "backlink-type" not in el.attrib: + el.set("backlink-type", "referenced-by") + if "backlink-anchor" not in el.attrib and self.last_heading_id: + el.set("backlink-anchor", self.last_heading_id) + else: + self._enhance_autorefs(el) diff --git a/src/mkdocs_autorefs/plugin.py b/src/mkdocs_autorefs/plugin.py index 051b074..a85ed59 100644 --- a/src/mkdocs_autorefs/plugin.py +++ b/src/mkdocs_autorefs/plugin.py @@ -12,6 +12,7 @@ import contextlib import functools import logging +from collections import defaultdict from pathlib import PurePosixPath as URL # noqa: N814 from typing import TYPE_CHECKING, Any, Callable, Literal from urllib.parse import urlsplit @@ -22,6 +23,7 @@ from mkdocs.plugins import BasePlugin, event_priority from mkdocs.structure.pages import Page +from mkdocs_autorefs.backlinks import Backlink, BacklinkCrumb from mkdocs_autorefs.references import AutorefsExtension, fix_refs, relative_url if TYPE_CHECKING: @@ -30,6 +32,7 @@ from jinja2.environment import Environment from mkdocs.config.defaults import MkDocsConfig from mkdocs.structure.files import Files + from mkdocs.structure.nav import Section from mkdocs.structure.pages import Page from mkdocs.structure.toc import AnchorLink @@ -99,6 +102,7 @@ class AutorefsPlugin(BasePlugin[AutorefsConfig]): """ scan_toc: bool = True + record_backlinks: bool = False current_page: Page | None = None 
# YORE: Bump 2: Remove line. legacy_refs: bool = True @@ -135,7 +139,9 @@ def __init__(self) -> None: self._primary_url_map: dict[str, list[str]] = {} self._secondary_url_map: dict[str, list[str]] = {} self._title_map: dict[str, str] = {} + self._backlink_page_map: dict[str, Page] = {} self._abs_url_map: dict[str, str] = {} + self._backlinks: dict[str, dict[str, set[str]]] = defaultdict(lambda: defaultdict(set)) # YORE: Bump 2: Remove line. self._get_fallback_anchor: Callable[[str], tuple[str, ...]] | None = None # YORE: Bump 2: Remove line. @@ -162,6 +168,56 @@ def get_fallback_anchor(self, value: Callable[[str], tuple[str, ...]] | None) -> stacklevel=2, ) + def _record_backlink(self, identifier: str, backlink_type: str, backlink_anchor: str, page_url: str) -> None: + """Record a backlink. + + Arguments: + identifier: The target identifier. + backlink_type: The type of backlink. + backlink_anchor: The anchor, on the page containing the reference, that the backlink points to. + page_url: The URL of the page containing the backlink. + """ + # When we record backlinks, all identifiers have been registered. + # If an identifier is not found in the primary or secondary URL maps, it's an absolute URL, + # meaning it comes from an external source (typically an object inventory), + # and we don't need to record backlinks for it. + if identifier in self._primary_url_map or identifier in self._secondary_url_map: + self._backlinks[identifier][backlink_type].add(f"{page_url}#{backlink_anchor}") + + def get_backlinks(self, *identifiers: str, from_url: str) -> dict[str, set[Backlink]]: + """Return the backlinks to the given identifiers, relative to the given URL. + + Arguments: + *identifiers: The identifiers to get backlinks for. + from_url: The URL of the page where backlinks are rendered. + + Returns: + A dictionary of backlinks, with the type of reference as key and a set of backlinks as value. + Each backlink holds a tuple of `BacklinkCrumb` (title, URL) objects forming navigation breadcrumbs. 
+ """ + relative_backlinks: dict[str, set[Backlink]] = defaultdict(set) + for identifier in set(identifiers): + backlinks = self._backlinks.get(identifier, {}) + for backlink_type, backlink_urls in backlinks.items(): + for backlink_url in backlink_urls: + relative_backlinks[backlink_type].add(self._crumbs(from_url, backlink_url)) + return relative_backlinks + + def _crumbs(self, from_url: str, backlink_url: str) -> Backlink: + backlink_page: Page = self._backlink_page_map[backlink_url] + backlink_title = self._title_map.get(backlink_url, "") + crumbs: list[BacklinkCrumb] = [ + BacklinkCrumb(backlink_title, relative_url(from_url, backlink_url)), + BacklinkCrumb(backlink_page.title, relative_url(from_url, backlink_page.url + "#")), + ] + page: Page | Section = backlink_page + while page.parent: + page = page.parent + if url := getattr(page, "url", ""): + url = relative_url(from_url, url + "#") + crumbs.append(BacklinkCrumb(page.title, url)) + return Backlink(tuple(reversed(crumbs))) + def register_anchor( self, page: Page, @@ -196,6 +252,8 @@ def register_anchor( url_map[identifier] = [url] if title and url not in self._title_map: self._title_map[url] = title + if self.record_backlinks and url not in self._backlink_page_map: + self._backlink_page_map[url] = page def register_url(self, identifier: str, url: str) -> None: """Register that the identifier should be turned into a link to this URL. @@ -406,7 +464,7 @@ def map_urls(self, page: Page, anchor: AnchorLink) -> None: @event_priority(-50) # Late, after mkdocstrings has finished loading inventories. def on_env(self, env: Environment, /, *, config: MkDocsConfig, files: Files) -> Environment: # noqa: ARG002 - """Apply cross-references. + """Apply cross-references and collect backlinks. Hook for the [`on_env` event](https://www.mkdocs.org/user-guide/plugins/#on_env). In this hook, we try to fix unresolved references of the form `[title][identifier]` or `[identifier][]`. 
@@ -415,6 +473,9 @@ def on_env(self, env: Environment, /, *, config: MkDocsConfig, files: Files) -> We log a warning for each reference that we couldn't map to an URL. + We also collect backlinks at the same time. We fix cross-refs and collect backlinks in a single pass + for performance reasons (we don't want to run the regular expression on each page twice). + Arguments: env: The MkDocs environment. config: The MkDocs config object. @@ -433,10 +494,14 @@ def on_env(self, env: Environment, /, *, config: MkDocsConfig, files: Files) -> from_url=file.page.url, fallback=self.get_fallback_anchor, ) + backlink_recorder = ( + functools.partial(self._record_backlink, page_url=file.page.url) if self.record_backlinks else None + ) # YORE: Bump 2: Replace `, _legacy_refs=self.legacy_refs` with `` within line. file.page.content, unmapped = fix_refs( file.page.content, url_mapper, + record_backlink=backlink_recorder, link_titles=self._link_titles, strip_title_tags=self._strip_title_tags, _legacy_refs=self.legacy_refs, diff --git a/src/mkdocs_autorefs/references.py b/src/mkdocs_autorefs/references.py index b660670..a59f64e 100644 --- a/src/mkdocs_autorefs/references.py +++ b/src/mkdocs_autorefs/references.py @@ -23,6 +23,8 @@ from markdown.util import HTML_PLACEHOLDER_RE, INLINE_PLACEHOLDER_RE from markupsafe import Markup +from mkdocs_autorefs.backlinks import BacklinksTreeProcessor + if TYPE_CHECKING: from collections.abc import Iterable from pathlib import Path @@ -328,6 +330,8 @@ class _AutorefsAttrs(dict): "filepath", "lineno", "slug", + "backlink-type", + "backlink-anchor", } @property @@ -416,6 +420,7 @@ def _strip_tags(html: str) -> str: def fix_ref( url_mapper: Callable[[str], tuple[str, str | None]], unmapped: list[tuple[str, AutorefsHookInterface.Context | None]], + record_backlink: Callable[[str, str, str], None] | None = None, *, link_titles: bool | Literal["external"] = True, strip_title_tags: bool = False, @@ -432,6 +437,7 @@ def fix_ref( url_mapper: A callable 
that gets an object's site URL by its identifier, such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][]. unmapped: A list to store unmapped identifiers. + record_backlink: A callable to record backlinks. link_titles: How to set HTML titles on links. Always (`True`), never (`False`), or external-only (`"external"`). strip_title_tags: Whether to strip HTML tags from link titles. @@ -449,6 +455,13 @@ def inner(match: Match) -> str: identifiers = (identifier, slug) if slug else (identifier,) + if ( + record_backlink + and (backlink_type := attrs.get("backlink-type")) + and (backlink_anchor := attrs.get("backlink-anchor")) + ): + record_backlink(identifier, backlink_type, backlink_anchor) + try: url, original_title = _find_url(identifiers, url_mapper) except KeyError: @@ -495,6 +508,7 @@ def fix_refs( html: str, url_mapper: Callable[[str], tuple[str, str | None]], *, + record_backlink: Callable[[str, str, str], None] | None = None, link_titles: bool | Literal["external"] = True, strip_title_tags: bool = False, # YORE: Bump 2: Remove line. @@ -506,6 +520,7 @@ def fix_refs( html: The text to fix. url_mapper: A callable that gets an object's site URL by its identifier, such as [mkdocs_autorefs.plugin.AutorefsPlugin.get_item_url][]. + record_backlink: A callable to record backlinks. link_titles: How to set HTML titles on links. Always (`True`), never (`False`), or external-only (`"external"`). strip_title_tags: Whether to strip HTML tags from link titles. 
@@ -514,7 +529,7 @@ def fix_refs( """ unmapped: list[tuple[str, AutorefsHookInterface.Context | None]] = [] html = AUTOREF_RE.sub( - fix_ref(url_mapper, unmapped, link_titles=link_titles, strip_title_tags=strip_title_tags), + fix_ref(url_mapper, unmapped, record_backlink, link_titles=link_titles, strip_title_tags=strip_title_tags), html, ) @@ -599,6 +614,11 @@ def _log_enabling_markdown_anchors() -> None: log.debug("Enabling Markdown anchors feature") +@lru_cache +def _log_enabling_backlinks() -> None: + log.debug("Enabling backlinks feature") + + class AutorefsExtension(Extension): """Markdown extension that transforms unresolved references into auto-references. @@ -627,7 +647,8 @@ def extendMarkdown(self, md: Markdown) -> None: # noqa: N802 (casing: parent me Add an instance of our [`AutorefsInlineProcessor`][mkdocs_autorefs.references.AutorefsInlineProcessor] to the Markdown parser. Also optionally add an instance of our [`AnchorScannerTreeProcessor`][mkdocs_autorefs.references.AnchorScannerTreeProcessor] - to the Markdown parser if a reference to the autorefs plugin was passed to this extension. + and [`BacklinksTreeProcessor`][mkdocs_autorefs.backlinks.BacklinksTreeProcessor] to the Markdown parser + if a reference to the autorefs plugin was passed to this extension. Arguments: md: A `markdown.Markdown` instance. @@ -637,10 +658,21 @@ def extendMarkdown(self, md: Markdown) -> None: # noqa: N802 (casing: parent me AutorefsInlineProcessor.name, priority=168, # Right after markdown.inlinepatterns.ReferenceInlineProcessor ) - if self.plugin is not None and self.plugin.scan_toc and "attr_list" in md.treeprocessors: - _log_enabling_markdown_anchors() - md.treeprocessors.register( - AnchorScannerTreeProcessor(self.plugin, md), - AnchorScannerTreeProcessor.name, - priority=0, - ) + if self.plugin is not None: + # Markdown anchors require the `attr_list` extension. 
+ if self.plugin.scan_toc and "attr_list" in md.treeprocessors: + _log_enabling_markdown_anchors() + md.treeprocessors.register( + AnchorScannerTreeProcessor(self.plugin, md), + AnchorScannerTreeProcessor.name, + priority=0, + ) + # Backlinks require IDs on headings, which are either set by `toc`, + # or manually by the user with `attr_list`. + if self.plugin.record_backlinks and ("attr_list" in md.treeprocessors or "toc" in md.treeprocessors): + _log_enabling_backlinks() + md.treeprocessors.register( + BacklinksTreeProcessor(self.plugin, md), + BacklinksTreeProcessor.name, + priority=0, + ) diff --git a/tests/test_backlinks.py b/tests/test_backlinks.py new file mode 100644 index 0000000..c52bcd6 --- /dev/null +++ b/tests/test_backlinks.py @@ -0,0 +1,64 @@ +"""Tests for the backlinks module.""" + +from __future__ import annotations + +from textwrap import dedent + +from markdown import Markdown + +from mkdocs_autorefs.backlinks import Backlink, BacklinkCrumb +from mkdocs_autorefs.plugin import AutorefsPlugin +from mkdocs_autorefs.references import AUTOREF_RE, AutorefsExtension, _html_attrs_parser +from tests.helpers import create_page + + +def test_record_backlinks() -> None: + """Check that only useful backlinks are recorded.""" + plugin = AutorefsPlugin() + plugin._record_backlink("foo", "referenced-by", "foo", "foo.html") + assert "foo" not in plugin._backlinks + + plugin.register_anchor(identifier="foo", page=create_page("foo.html"), primary=True) + plugin._record_backlink("foo", "referenced-by", "foo", "foo.html") + assert "foo" in plugin._backlinks + + +def test_get_backlinks() -> None: + """Check that backlinks can be retrieved.""" + plugin = AutorefsPlugin() + plugin.record_backlinks = True + plugin.register_anchor(identifier="foo", page=create_page("foo.html"), primary=True) + plugin._record_backlink("foo", "referenced-by", "foo", "foo.html") + assert plugin.get_backlinks("foo", from_url="") == { + "referenced-by": { + Backlink( + crumbs=( + 
BacklinkCrumb(title="foo.html", url="foo.html#"), + BacklinkCrumb(title="", url="foo.html#foo"), + ), + ), + }, + } + + +def test_backlinks_treeprocessor() -> None: + """Check that the backlinks treeprocessor works.""" + plugin = AutorefsPlugin() + plugin.record_backlinks = True + plugin.current_page = create_page("foo.html") + md = Markdown(extensions=["attr_list", "toc", AutorefsExtension(plugin)]) + html = md.convert( + dedent( + """ + [](){#alias} + ## Heading + + [Foo][foo] + """, + ), + ) + match = AUTOREF_RE.search(html) + assert match + attrs = _html_attrs_parser.parse(f"<a {match['attrs']}>") + assert "backlink-type" in attrs + assert "backlink-anchor" in attrs