Fix bug warning about some invalid directive arguments (#261)

mondeja · web-flow · commit bc8c2813cc66 · 2025-03-07T22:39:25.000+01:00
diff --git a/.gitattributes b/.gitattributes
@@ -1,2 +1,2 @@
 * text=auto
-examples/a-lot-of-includes/ -diff
+examples/a-lot-of-includes/docs/index.md -diff
diff --git a/examples/a-lot-of-includes/docs/index.md b/examples/a-lot-of-includes/docs/index.md
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "mkdocs-include-markdown-plugin"
-version = "7.1.4"
+version = "7.1.5"
 description = "Mkdocs Markdown includer plugin."
 readme = "README.md"
 license = "Apache-2.0"
diff --git a/src/mkdocs_include_markdown_plugin/directive.py b/src/mkdocs_include_markdown_plugin/directive.py
@@ -24,6 +24,7 @@ class DirectiveBoolArgument:  # noqa: D101
 
 
 if TYPE_CHECKING:  # pragma: no cover
+    from collections.abc import Iterable
     from typing import Callable, Literal, TypedDict
 
     DirectiveBoolArgumentsDict = dict[str, DirectiveBoolArgument]
@@ -50,18 +51,18 @@ class DirectiveBoolArgument:  # noqa: D101
 DOUBLE_QUOTED_STR_RE = r'([^"]|(?<=\\)")+'
 SINGLE_QUOTED_STR_RE = r"([^']|(?<=\\)')+"
 
-# In the following regular expression, the substrings "$OPENING_TAG"
-# and "$CLOSING_TAG" will be replaced by the effective opening and
-# closing tags in the `on_config` plugin event.
-INCLUDE_TAG_RE = rf'''
-    (?P<_includer_indent>[ \t\w\\.]*?)$OPENING_TAG
+# In the following regular expression, the substrings "\{%", "%\}"
+# will be replaced by custom opening and closing tags in the `on_config`
+# plugin event if required.
+INCLUDE_TAG_RE = r'''
+    (?P<_includer_indent>[ \t\w\\.]*?)\{%
     \s*
     include
     \s+
-    (?:"(?P<double_quoted_filename>{DOUBLE_QUOTED_STR_RE})")?(?:'(?P<single_quoted_filename>{SINGLE_QUOTED_STR_RE})')?
+    (?:"(?P<double_quoted_filename>''' + DOUBLE_QUOTED_STR_RE + r''')")?(?:'(?P<single_quoted_filename>''' + SINGLE_QUOTED_STR_RE + r''')')?
     (?P<arguments>.*?)
     \s*
-    $CLOSING_TAG
+    %\}
 '''  # noqa: E501
 
 TRUE_FALSE_STR_BOOL = {
@@ -110,6 +111,7 @@ def str_arg(arg: str) -> re.Pattern[str]:
     'heading-offset': functools.partial(arg, 'heading-offset'),
 }
 
+INCLUDE_MARKDOWN_DIRECTIVE_ARGS = set(ARGUMENT_REGEXES)
 INCLUDE_DIRECTIVE_ARGS = {
     key for key in ARGUMENT_REGEXES if key not in (
         'rewrite-relative-urls', 'heading-offset', 'comments',
@@ -121,6 +123,45 @@ def str_arg(arg: str) -> re.Pattern[str]:
 )
 
 
+def _maybe_arguments_iter(arguments_string: str) -> Iterable[str]:
+    """Iterate over parts of the string that look like arguments."""
+    current_string_opening = ''  # can be either `'` or `"`
+    inside_string = False
+    escaping = False
+    opening_argument = False  # whether the next char starts a value
+    current_value = ''
+
+    for c in arguments_string:
+        if inside_string:  # string contents are never added to current_value
+            if c == '\\':
+                escaping = not escaping
+                continue
+            elif c == current_string_opening and not escaping:
+                inside_string = False
+                current_string_opening = ''
+            else:
+                escaping = False
+        elif c == '=':  # the word right before `=` is the candidate name
+            new_current_value = ''
+            for ch in reversed(current_value):
+                if ch in string.whitespace:
+                    current_value = new_current_value[::-1]
+                    break
+                new_current_value += ch
+            yield current_value
+            current_value = ''
+            opening_argument = True
+        elif opening_argument:
+            opening_argument = False
+            if c in ('"', "'"):
+                current_string_opening = c
+                inside_string = True
+                current_value += c
+            current_value += c  # 1st unquoted char, or 2nd quote of `""`
+        else:
+            current_value += c
+
+
 def warn_invalid_directive_arguments(
     arguments_string: str,
     directive_lineno: Callable[[], int],
@@ -130,18 +171,17 @@ def warn_invalid_directive_arguments(
 ) -> None:
     """Warns about the invalid arguments passed to a directive."""
     valid_args = (
-        INCLUDE_DIRECTIVE_ARGS if directive == 'include'
-        else set(ARGUMENT_REGEXES)
+        INCLUDE_DIRECTIVE_ARGS
+        if directive == 'include'
+        else INCLUDE_MARKDOWN_DIRECTIVE_ARGS
     )
-    for arg_value in WARN_INVALID_DIRECTIVE_ARGS_REGEX.findall(
-        arguments_string,
-    ):
-        if arg_value.split('=', 1)[0] not in valid_args:
+    for maybe_arg in _maybe_arguments_iter(arguments_string):
+        if maybe_arg not in valid_args:
             location = process.file_lineno_message(
                 page_src_path, docs_dir, directive_lineno(),
             )
             logger.warning(
-                f"Invalid argument '{arg_value}' in"
+                f"Invalid argument '{maybe_arg}' in"
                 f" '{directive}' directive at {location}. Ignoring...",
             )
 
@@ -156,9 +196,9 @@ def parse_filename_argument(
         if raw_filename is None:
             filename = None
         else:
-            filename = raw_filename.replace("\\'", "'")
+            filename = raw_filename.replace(r"\'", "'")
     else:
-        filename = raw_filename.replace('\\"', '"')
+        filename = raw_filename.replace(r'\"', '"')
     return filename, raw_filename
 
 
@@ -168,9 +208,9 @@ def parse_string_argument(match: re.Match[str]) -> str | None:
     if value is None:
         value = match[3]
         if value is not None:
-            value = value.replace("\\'", "'")
+            value = value.replace(r"\'", "'")
     else:
-        value = value.replace('\\"', '"')
+        value = value.replace(r'\"', '"')
     return value
 
 
@@ -182,12 +222,24 @@ def create_include_tag(
     Replaces the substrings '$OPENING_TAG' and '$CLOSING_TAG' from
     INCLUDE_TAG_RE by the effective tag.
     """
-    return re.compile(
-        INCLUDE_TAG_RE.replace(' include', f' {tag}', 1).replace(
-            '$OPENING_TAG', re.escape(opening_tag), 1,
-        ).replace('$CLOSING_TAG', re.escape(closing_tag), 1),
-        flags=re.VERBOSE | re.DOTALL,
-    )
+    pattern = INCLUDE_TAG_RE
+    if tag != 'include':
+        pattern = pattern.replace(
+            ' include',
+            (
+                ' include-markdown' if tag == 'include-markdown'
+                else f' {re.escape(tag)}'
+            ),
+            1,
+        )
+
+    if opening_tag != '{%':
+        pattern = pattern.replace(r'\{%', re.escape(opening_tag), 1)
+
+    if closing_tag != '%}':
+        pattern = pattern.replace(r'%\}', re.escape(closing_tag), 1)
+
+    return re.compile(pattern, flags=re.VERBOSE | re.DOTALL)
 
 
 def parse_bool_options(
diff --git a/src/mkdocs_include_markdown_plugin/process.py b/src/mkdocs_include_markdown_plugin/process.py
@@ -249,9 +249,10 @@ def transform_line_by_line_skipping_codeblocks(
         markdown: str,
         func: Callable[[str], str],
 ) -> str:
-    """Apply a transformation line by line in a Markdown text using a function.
+    """Apply a transformation line by line in a Markdown text using a function.
 
-    Skip fenced codeblock lines, where the transformation never is applied.
+    Skip fenced codeblock lines and empty lines, where the transformation
+    is never applied.
 
     Indented codeblocks are not taken into account because in the practice
     this function is only used for transformations of heading prefixes. See
@@ -263,13 +264,15 @@ def transform_line_by_line_skipping_codeblocks(
 
     lines = []
     for line in io.StringIO(markdown):
+        lstripped_line = line.lstrip()
         if not _current_fcodeblock_delimiter:
-            lstripped_line = line.lstrip()
-            if lstripped_line.startswith(('```', '~~~')):
-                _current_fcodeblock_delimiter = lstripped_line[:3]
+            if lstripped_line.startswith('```'):
+                _current_fcodeblock_delimiter = '```'
+            elif lstripped_line.startswith('~~~'):
+                _current_fcodeblock_delimiter = '~~~'
             else:
                 line = func(line)  # noqa: PLW2901
-        elif line.lstrip().startswith(_current_fcodeblock_delimiter):
+        elif lstripped_line.startswith(_current_fcodeblock_delimiter):
             _current_fcodeblock_delimiter = ''
         lines.append(line)
 
@@ -287,39 +290,27 @@ def rewrite_relative_urls(
     ``source_path`` will still work when inserted into a file at
     ``destination_path``.
     """
-    from urllib.parse import urlparse, urlunparse
-
     def rewrite_url(url: str) -> str:
-        if is_url(url):
-            return url
-
-        scheme, netloc, path, params, query, fragment = urlparse(url)
-
-        # absolute or mail
-        if path.startswith('/') or scheme == 'mailto':
+        if is_url(url) or is_absolute_path(url):
             return url
 
         new_path = os.path.relpath(
-            os.path.join(os.path.dirname(source_path), path),
+            os.path.join(os.path.dirname(source_path), url),
             os.path.dirname(destination_path),
         )
 
         # ensure forward slashes are used, on Windows
         new_path = new_path.replace('\\', '/').replace('//', '/')
 
         try:
-            if path[-1] == '/':
+            if url[-1] == '/':
                 # the above operation removes a trailing slash,
                 # so add it back if it was present in the input
                 new_path += '/'
         except IndexError:  # pragma: no cover
             pass
 
-        # ensure that links to the same file are not rewritten
-        if new_path == '.':
-            new_path = ''
-
-        return urlunparse((scheme, netloc, new_path, params, query, fragment))
+        return new_path
 
     def found_href(m: re.Match[str], url_group_index: int = -1) -> str:
         match_start, match_end = m.span(0)
@@ -528,27 +519,68 @@ def filter_paths(
     return response
 
 
+def _is_valid_url_scheme_char(c: str) -> bool:
+    """Determine if a character is a valid URL scheme character.
+
+    Valid characters are:
+
+    ```
+    abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-.
+    ```
+    """
+    codepoint = ord(c)
+    A = 65
+    Z = 90
+    a = 97
+    z = 122
+    zero = 48
+    nine = 57
+    dot = 46
+    plus = 43
+    minus = 45
+    return (
+        A <= codepoint <= Z
+        or a <= codepoint <= z
+        or zero <= codepoint <= nine
+        or codepoint in (plus, minus, dot)
+    )
+
+
 def is_url(string: str) -> bool:
-    """Determine if a string is an URL."""
-    if ':' not in string:  # fast path
+    """Determine if a string is an URL.
+
+    The implementation has been adapted from `urllib.parse.urlparse`.
+    """
+    i = string.find(':')
+    if i <= 1:  # noqa: PLR2004 -> exclude C: or D: on Windows
         return False
-    from urllib.parse import urlparse
 
     try:
-        result = urlparse(string)
-        return all([result.scheme, result.netloc])
-    except ValueError:  # pragma: no cover
+        return all(_is_valid_url_scheme_char(string[j]) for j in range(i))
+    except (IndexError, ValueError):  # pragma: no cover
         return False
 
 
 def is_relative_path(string: str) -> bool:
     """Check if a string looks like a relative path."""
-    return string.startswith(('./', '../'))
+    try:
+        return (
+            string[0] == '.'
+            and (
+                string[1] == '/'
+                or (string[1] == '.' and string[2] == '/')
+            )
+        )
+    except IndexError:  # pragma: no cover
+        return False
 
 
 def is_absolute_path(string: str) -> bool:
     """Check if a string looks like an absolute path."""
-    return string.startswith((os.sep, '/'))
+    try:
+        return string[0] == '/' or string[0] == os.sep
+    except IndexError:  # pragma: no cover
+        return False
 
 
 def read_file(file_path: str, encoding: str) -> str:
@@ -581,14 +613,12 @@ def read_url(
 def safe_os_path_relpath(path: str, start: str) -> str:
     """Return the relative path of a file from a start directory.
 
-    Safe version of `os.path.relpath` that catches `ValueError` exceptions
-    on Windows and returns the original path in case of error.
+    Safe version of `os.path.relpath` that catches possible `ValueError`
+    exceptions and returns the original path in case of error.
     On Windows, `ValueError` is raised when `path` and `start` are on
     different drives.
     """
-    if os.name != 'nt':  # pragma: nt no cover
-        return os.path.relpath(path, start)
-    try:  # pragma: nt cover
+    try:
         return os.path.relpath(path, start)
     except ValueError:  # pragma: no cover
         return path
diff --git a/tests/test_unit/test_arguments.py b/tests/test_unit/test_arguments.py
@@ -284,7 +284,7 @@ def test_invalid_argument_name(directive, page, tmp_path, plugin, caplog):
 
     assert len(caplog.records) == 1
     assert caplog.records[0].msg == (
-        f"Invalid argument 'invalid-argument=true' in '{directive}'"
+        f"Invalid argument 'invalid-argument' in '{directive}'"
         " directive at includer.md:1. Ignoring..."
     )
 

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`* text=auto`
`2`		`-examples/a-lot-of-includes/ -diff`
	`2`	`+examples/a-lot-of-includes/docs/index.md -diff`
Original file line number	Diff line number	Diff line change
`@@ -284,7 +284,7 @@ def test_invalid_argument_name(directive, page, tmp_path, plugin, caplog):`
`284`	`284`
`285`	`285`	`assert len(caplog.records) == 1`
`286`	`286`	`assert caplog.records[0].msg == (`
`287`		`- f"Invalid argument 'invalid-argument=true' in '{directive}'"`
	`287`	`+ f"Invalid argument 'invalid-argument' in '{directive}'"`
`288`	`288`	`" directive at includer.md:1. Ignoring..."`
`289`	`289`	`)`
`290`	`290`