Skip to content

Commit

Permalink
Merge pull request #14 from 3mdeb/ignore_codeblocks
Browse files Browse the repository at this point in the history
namespell: Ignore text inside code blocks
  • Loading branch information
PLangowski authored Oct 22, 2024
2 parents 8ef0a2a + 7516397 commit 7dacedb
Show file tree
Hide file tree
Showing 4 changed files with 183 additions and 8 deletions.
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ default_stages: [pre-commit]

default_install_hook_types: [pre-commit, commit-msg]

exclude: ^hooks/test/

ci:
autoupdate_commit_msg: 'pre-commit: autoupdate hooks'
autofix_prs: false
Expand Down
147 changes: 139 additions & 8 deletions hooks/namespell.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
import os
import re
import sys
import time
from collections import deque
from importlib import metadata
from typing import List, NamedTuple

NAME_RULES = {
"Zarhus": "Zarhus",
Expand All @@ -28,15 +31,129 @@
IGNORE_STRING = "namespell:disable"


def __get_comment_string(file) -> str:
_, extension = os.path.splitext(f"./{file.name}")
class IgnoreBlock(NamedTuple):
start: str
end: str


class IgnoreInline(NamedTuple):
start: str
end: str


IGNORE_BLOCKS = {
".md": [IgnoreBlock("```", "```"), IgnoreBlock("<!--", "-->")],
"default": [],
}

IGNORE_INLINE = {
".md": [IgnoreInline("`", "`"), IgnoreInline("<!--", "-->")],
"default": [],
}

# Stack that keeps track of "active" blocks to be ignored by storing their
# start strings. This is necessary for nested blocks (e.g code block inside
# comment)
blocks = deque()
current_start_token = ""


def __get_comment_string(extension) -> str:
return (
COMMENT_STRINGS[extension]
if extension in COMMENT_STRINGS.keys()
else COMMENT_STRINGS["default"]
)


def __get_ignore_blocks(extension) -> List[IgnoreBlock]:
return (
IGNORE_BLOCKS[extension]
if extension in IGNORE_BLOCKS.keys()
else IGNORE_BLOCKS["default"]
)


def __get_inline_ignore(extension) -> List[IgnoreInline]:
return (
IGNORE_INLINE[extension]
if extension in IGNORE_INLINE.keys()
else IGNORE_INLINE["default"]
)


def __log_verbose(message, verbose):
if verbose:
print(message)


# Check if the line starts/ends a block to be ignored and modify the blocks
# stack accordingly
def __check_block(ignore_blocks, line, verbose=False):
global current_start_token
for ignore_block in ignore_blocks:
start_index = line.find(ignore_block.start)
end_index = line.find(ignore_block.end)
if start_index != -1:
# For identical start and end tokens: If the last start token was
# the same as this one, then this one is an end token
if (
ignore_block.start == ignore_block.end
and current_start_token == ignore_block.start
):
start_index = -1
else:
__log_verbose(
f"BLOCK START: {ignore_block.start}, INDEX: {start_index}", verbose
)
blocks.append(ignore_block.start)
current_start_token = ignore_block.start
# Check if block end matches the latest block start token
# If true, remove it from the stack
if end_index != -1 and end_index != start_index:
__log_verbose(f"BLOCK END: {ignore_block.end}, INDEX: {end_index}", verbose)
if current_start_token == ignore_block.start:
current_start_token = ""
try:
blocks.pop()
if blocks:
# Get element from the top without popping it
current_start_token = blocks[-1]
except IndexError:
return


# return list of indices of words to be ignored
def __check_inline_ignore(ignore_inline, line, verbose=False):
indices = []
for element in ignore_inline:
start_index = line.find(element.start)
if start_index == -1:
continue
position = 0
line_to_process = line
while start_index != -1:
__log_verbose(f"INLINE BLOCK START: {position + start_index}", verbose)
position += start_index
line_to_process = line_to_process[start_index + 1 :]
end_index = line_to_process.find(element.end) + 1
if end_index == -1:
print("Error: No matching inline comment/code ending tag")
__log_verbose(f"INLINE BLOCK END: {position + end_index}", verbose)
line_slice = line_to_process[: end_index - 1]
for name in NAME_RULES.keys():
pattern = re.compile(
rf"(?<![-_\./=\"#]){re.escape(name)}(?![-_\./=\"#])", re.IGNORECASE
)
for m in re.finditer(pattern, line_slice):
i = position + m.start() + 1
indices.append(i)
position += end_index + 1
line_to_process = line_to_process[end_index:]
start_index = line_to_process.find(element.start)
return indices


def __get_active_rules(
filename, line, line_number, line_ignore_pattern, file_ignore_pattern
):
Expand Down Expand Up @@ -73,10 +190,14 @@ def __get_active_rules(
return active_rules, are_file_rules


def check_and_fix_file(filename, autofix=False):
def check_and_fix_file(filename, autofix=False, verbose=False):
with open(filename, "r", encoding="utf8", errors="ignore") as file:
lines = file.readlines()
comment_string = __get_comment_string(file)
_, extension = os.path.splitext(f"./{file.name}")
comment_string = __get_comment_string(extension)
ignore_blocks = __get_ignore_blocks(extension)
ignore_inline = __get_inline_ignore(extension)
__log_verbose(f"FILE: {filename}", verbose)

# Don't check empty files
if len(lines) == 0:
Expand All @@ -95,7 +216,13 @@ def check_and_fix_file(filename, autofix=False):
if file_rules == {}:
return True
for line_number, line in enumerate(lines, start=1):
__log_verbose(f"LINE {line_number}", verbose)
fixed_line = line
__check_block(ignore_blocks, line, verbose)
to_ignore = __check_inline_ignore(ignore_inline, line, verbose)
if blocks:
fixed_lines.append(fixed_line)
continue
active_rules = file_rules
if line_number != 1:
line_rules, _ = __get_active_rules(
Expand All @@ -112,12 +239,13 @@ def check_and_fix_file(filename, autofix=False):
pattern = re.compile(
rf"(?<![-_\./=\"#]){re.escape(name)}(?![-_\./=\"#])", re.IGNORECASE
)
matches = pattern.findall(line)
matches = re.finditer(pattern, line)
for match in matches:
if match != correct_format:
__log_verbose(f"FOUND: {match.group()} AT {match.start()}", verbose)
if match.group() != correct_format and match.start() not in to_ignore:
found_issues = True
print(
f"{filename}:{line_number}: '{match}' should be '{correct_format}'"
f"{filename}:{line_number}: '{match.group()}' should be '{correct_format}'"
)
if autofix:
fixed_line = re.sub(pattern, correct_format, fixed_line)
Expand Down Expand Up @@ -147,14 +275,17 @@ def parse_args() -> argparse.Namespace:
help="Automatically fix issues",
)
parser.add_argument("files", nargs="+", help="File(s) to parse")
parser.add_argument(
"--verbose", action="store_true", default=False, help="Run tool in verbose mode"
)
return parser.parse_args()


def main():
args = parse_args()
all_passed = True
for filename in args.files:
if not check_and_fix_file(filename, args.fix):
if not check_and_fix_file(filename, args.fix, args.verbose):
all_passed = False
if not all_passed:
sys.exit(1)
Expand Down
17 changes: 17 additions & 0 deletions hooks/test/errors.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Section

This zarhus should be detected as a mistake. <!-- but this zarhus shouldn't -->

Zarhus `zarhus` Dasharo dasharo

```text
zarhus
dasharo
```

## This zarhus is a mistake

<!--
dasharo
-->
dasharo
25 changes: 25 additions & 0 deletions hooks/test/no_errors.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# This document shall have no errors: `zarhus`

This is inline `zarhus and yocto` and this is inline comment<!-- zarhus --> to be ignored
<!-- This
is block comment and everything zarhus and yocto should be ignored even inline `zarhus`
zarhus
-->

```text
This zarhus
this zarhus and yocto should be zarhus ignored
and `zarhus` this zarhus
```

1. Indented code block

```text
This zarhus
should be zarhus also ignored
and `zarhus` this zarhus
```
It's also possible to disable zarhus and yocto rule inline <!-- namespell:disable Zarhus, Yocto -->
All rules disabled: zarhus, yocto, uefi, nvidia <!-- namespell:disable -->

0 comments on commit 7dacedb

Please sign in to comment.