Merge pull request #14 from 3mdeb/ignore_codeblocks

namespell: Ignore text inside code blocks
3mdeb · Oct 22, 2024 · 7dacedb · 7dacedb
2 parents 8ef0a2a + 7516397
commit 7dacedb
Show file tree

Hide file tree

Showing 4 changed files with 183 additions and 8 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -3,6 +3,8 @@ default_stages: [pre-commit]
 
 default_install_hook_types: [pre-commit, commit-msg]
 
+exclude: ^hooks/test/
+
 ci:
   autoupdate_commit_msg: 'pre-commit: autoupdate hooks'
   autofix_prs: false

diff --git a/hooks/namespell.py b/hooks/namespell.py
@@ -2,7 +2,10 @@
 import os
 import re
 import sys
+import time
+from collections import deque
 from importlib import metadata
+from typing import List, NamedTuple
 
 NAME_RULES = {
     "Zarhus": "Zarhus",
@@ -28,15 +31,129 @@
 IGNORE_STRING = "namespell:disable"
 
 
-def __get_comment_string(file) -> str:
-    _, extension = os.path.splitext(f"./{file.name}")
+class IgnoreBlock(NamedTuple):
+    start: str
+    end: str
+
+
+class IgnoreInline(NamedTuple):
+    start: str
+    end: str
+
+
+IGNORE_BLOCKS = {
+    ".md": [IgnoreBlock("```", "```"), IgnoreBlock("<!--", "-->")],
+    "default": [],
+}
+
+IGNORE_INLINE = {
+    ".md": [IgnoreInline("`", "`"), IgnoreInline("<!--", "-->")],
+    "default": [],
+}
+
+# Stack that keeps track of "active" blocks to be ignored by storing their
+# start strings. This is necessary for nested blocks (e.g code block inside
+# comment)
+blocks = deque()
+current_start_token = ""
+
+
+def __get_comment_string(extension) -> str:
     return (
         COMMENT_STRINGS[extension]
         if extension in COMMENT_STRINGS.keys()
         else COMMENT_STRINGS["default"]
     )
 
 
+def __get_ignore_blocks(extension) -> List[IgnoreBlock]:
+    return (
+        IGNORE_BLOCKS[extension]
+        if extension in IGNORE_BLOCKS.keys()
+        else IGNORE_BLOCKS["default"]
+    )
+
+
+def __get_inline_ignore(extension) -> List[IgnoreInline]:
+    return (
+        IGNORE_INLINE[extension]
+        if extension in IGNORE_INLINE.keys()
+        else IGNORE_INLINE["default"]
+    )
+
+
+def __log_verbose(message, verbose):
+    if verbose:
+        print(message)
+
+
+# Check if the line starts/ends a block to be ignored and modify the blocks
+# stack accordingly
+def __check_block(ignore_blocks, line, verbose=False):
+    global current_start_token
+    for ignore_block in ignore_blocks:
+        start_index = line.find(ignore_block.start)
+        end_index = line.find(ignore_block.end)
+        if start_index != -1:
+            # For identical start and end tokens: If the last start token was
+            # the same as this one, then this one is an end token
+            if (
+                ignore_block.start == ignore_block.end
+                and current_start_token == ignore_block.start
+            ):
+                start_index = -1
+            else:
+                __log_verbose(
+                    f"BLOCK START: {ignore_block.start}, INDEX: {start_index}", verbose
+                )
+                blocks.append(ignore_block.start)
+                current_start_token = ignore_block.start
+        # Check if block end matches the latest block start token
+        # If true, remove it from the stack
+        if end_index != -1 and end_index != start_index:
+            __log_verbose(f"BLOCK END: {ignore_block.end}, INDEX: {end_index}", verbose)
+            if current_start_token == ignore_block.start:
+                current_start_token = ""
+                try:
+                    blocks.pop()
+                    if blocks:
+                        # Get element from the top without popping it
+                        current_start_token = blocks[-1]
+                except IndexError:
+                    return
+
+
+# return list of indices of words to be ignored
+def __check_inline_ignore(ignore_inline, line, verbose=False):
+    indices = []
+    for element in ignore_inline:
+        start_index = line.find(element.start)
+        if start_index == -1:
+            continue
+        position = 0
+        line_to_process = line
+        while start_index != -1:
+            __log_verbose(f"INLINE BLOCK START: {position + start_index}", verbose)
+            position += start_index
+            line_to_process = line_to_process[start_index + 1 :]
+            end_index = line_to_process.find(element.end) + 1
+            if end_index == -1:
+                print("Error: No matching inline comment/code ending tag")
+            __log_verbose(f"INLINE BLOCK END: {position + end_index}", verbose)
+            line_slice = line_to_process[: end_index - 1]
+            for name in NAME_RULES.keys():
+                pattern = re.compile(
+                    rf"(?<![-_\./=\"#]){re.escape(name)}(?![-_\./=\"#])", re.IGNORECASE
+                )
+                for m in re.finditer(pattern, line_slice):
+                    i = position + m.start() + 1
+                    indices.append(i)
+            position += end_index + 1
+            line_to_process = line_to_process[end_index:]
+            start_index = line_to_process.find(element.start)
+    return indices
+
+
 def __get_active_rules(
     filename, line, line_number, line_ignore_pattern, file_ignore_pattern
 ):
@@ -73,10 +190,14 @@ def __get_active_rules(
     return active_rules, are_file_rules
 
 
-def check_and_fix_file(filename, autofix=False):
+def check_and_fix_file(filename, autofix=False, verbose=False):
     with open(filename, "r", encoding="utf8", errors="ignore") as file:
         lines = file.readlines()
-        comment_string = __get_comment_string(file)
+        _, extension = os.path.splitext(f"./{file.name}")
+        comment_string = __get_comment_string(extension)
+        ignore_blocks = __get_ignore_blocks(extension)
+        ignore_inline = __get_inline_ignore(extension)
+        __log_verbose(f"FILE: {filename}", verbose)
 
     # Don't check empty files
     if len(lines) == 0:
@@ -95,7 +216,13 @@ def check_and_fix_file(filename, autofix=False):
     if file_rules == {}:
         return True
     for line_number, line in enumerate(lines, start=1):
+        __log_verbose(f"LINE {line_number}", verbose)
         fixed_line = line
+        __check_block(ignore_blocks, line, verbose)
+        to_ignore = __check_inline_ignore(ignore_inline, line, verbose)
+        if blocks:
+            fixed_lines.append(fixed_line)
+            continue
         active_rules = file_rules
         if line_number != 1:
             line_rules, _ = __get_active_rules(
@@ -112,12 +239,13 @@ def check_and_fix_file(filename, autofix=False):
             pattern = re.compile(
                 rf"(?<![-_\./=\"#]){re.escape(name)}(?![-_\./=\"#])", re.IGNORECASE
             )
-            matches = pattern.findall(line)
+            matches = re.finditer(pattern, line)
             for match in matches:
-                if match != correct_format:
+                __log_verbose(f"FOUND: {match.group()} AT {match.start()}", verbose)
+                if match.group() != correct_format and match.start() not in to_ignore:
                     found_issues = True
                     print(
-                        f"{filename}:{line_number}: '{match}' should be '{correct_format}'"
+                        f"{filename}:{line_number}: '{match.group()}' should be '{correct_format}'"
                     )
                 if autofix:
                     fixed_line = re.sub(pattern, correct_format, fixed_line)
@@ -147,14 +275,17 @@ def parse_args() -> argparse.Namespace:
         help="Automatically fix issues",
     )
     parser.add_argument("files", nargs="+", help="File(s) to parse")
+    parser.add_argument(
+        "--verbose", action="store_true", default=False, help="Run tool in verbose mode"
+    )
     return parser.parse_args()
 
 
 def main():
     args = parse_args()
     all_passed = True
     for filename in args.files:
-        if not check_and_fix_file(filename, args.fix):
+        if not check_and_fix_file(filename, args.fix, args.verbose):
             all_passed = False
     if not all_passed:
         sys.exit(1)

diff --git a/hooks/test/errors.md b/hooks/test/errors.md
@@ -0,0 +1,17 @@
+# Section
+
+This zarhus should be detected as a mistake. <!-- but this zarhus shouldn't -->
+
+Zarhus `zarhus` Dasharo dasharo
+
+```text
+zarhus
+dasharo
+```
+
+## This zarhus is a mistake
+
+<!--
+dasharo
+-->
+dasharo
diff --git a/hooks/test/no_errors.md b/hooks/test/no_errors.md
@@ -0,0 +1,25 @@
+# This document shall have no errors: `zarhus`
+
+This is inline `zarhus and yocto` and this is inline comment<!-- zarhus --> to be ignored
+<!-- This
+is block comment and everything zarhus and yocto should be ignored even inline `zarhus`
+zarhus
+-->
+
+```text
+This zarhus
+this zarhus and yocto should be zarhus ignored
+and `zarhus` this zarhus
+```
+
+1. Indented code block
+
+    ```text
+    This zarhus
+    should be zarhus also ignored
+    and `zarhus` this zarhus
+    ```
+
+It's also possible to disable zarhus and yocto rule inline <!-- namespell:disable Zarhus, Yocto -->
+
+All rules disabled: zarhus, yocto, uefi, nvidia <!-- namespell:disable -->