codeblocks.py

#!/usr/bin/env python3

from enum import Enum
from itertools import chain
from tempfile import NamedTemporaryFile

import argparse
import glob
import os
import os.path
import re
import shutil
import sys

class Codeblock(Enum):
    IN = 1
    IN_WITH_LANG = 2
    OUT = 3

INTLINE_PATTERN = re.compile("```+[^`]+`")
DEFAULT_LANG = "txt"

try:
    from magika import Magika
    magika = Magika()
    def guess_language(code):
        codebytes = code.encode(encoding="utf-8")
        lang = magika.identify_bytes(codebytes).prediction.output.label
        return lang if lang != "unknown" else DEFAULT_LANG
except ImportError:
    try:
        from guesslang import Guess
        guesslang = Guess()
        def guess_language(code):
            lang = guesslang.language_name(code)
            return lang.lower() if lang else DEFAULT_LANG
    except ImportError:
        print("Magika or Guesslang is required to run this script. Install one of them to proceed!")
        sys.exit(1)

def add_language(files, edit_files):
    def is_made_of_char(str, char):
        return len(str) == str.count(char)
    # newline="" is important. See below.
    temp = NamedTemporaryFile("w+", encoding="utf-8", newline="", delete=False)
    for file in files:
        blockstate = Codeblock.OUT
        code = []
        # The argument newline="" is important here and in the NamedTemporaryFile() call above.
        # We don't want to change line endings in a file which we're editing.
        with open(file, encoding="utf-8", newline="") as f:
            for linenum, line in enumerate(f, 1):
                stripped = line.strip()
                if (stripped.startswith("```") and not INTLINE_PATTERN.match(stripped)
                        and (blockstate == Codeblock.OUT or is_made_of_char(stripped, "`")
                            and len(stripped) >= backticks_num)):
                    if  blockstate == Codeblock.IN_WITH_LANG:
                        blockstate = Codeblock.OUT
                        if edit_files: temp.write(line)
                    elif blockstate == Codeblock.IN:
                        blockstate = Codeblock.OUT
                        code_str = "\n".join(line.removeprefix(indent).rstrip() for line in code) + "\n"
                        lang = guess_language(code_str) if code_str else ""
                        if edit_files:
                            # When editing files, txt is not very useful edit.
                            if lang == "txt": lang = ""
                            fence_start = "`" * backticks_num
                            temp.write(fence.replace(fence_start, fence_start + lang))
                            temp.writelines(code)
                            temp.write(line)
                        elif lang:
                            print(f"{file}:{linenum - len(code) - 1}")
                            print(("`" * backticks_num) + lang + "\n" + code_str + stripped)
                            print()
                        code = []
                    elif is_made_of_char(stripped, "`"):
                        backticks_num = len(stripped)
                        blockstate = Codeblock.IN
                        count = len(line) - len(line.lstrip())
                        indent = line[:count]
                        fence = line
                    else:
                        backticks_num = stripped.count("`")
                        blockstate = Codeblock.IN_WITH_LANG
                        if edit_files: temp.write(line)
                elif blockstate == Codeblock.IN:
                    code.append(line)
                elif edit_files:
                    temp.write(line)
            if edit_files:
                if code:
                    # non-terminated fence
                    temp.write(fence)
                    temp.writelines(code)
                temp.flush()
                shutil.copy(temp.name, file)
                temp.seek(0)
                temp.truncate(0)
    temp.close()
    os.unlink(temp.name)

def main():
    parser = argparse.ArgumentParser(description="Detect and insert the language in the Markdown code blocks.")
    parser.add_argument("--edit", action="store_true", help="Edit files by inserting the language")
    parser.add_argument("path", nargs="+", help="Paths to process")
    args = parser.parse_args()

    files = []
    iters = []

    for path in args.path:
        if os.path.isfile(path):
            files.append(path)
        elif os.path.isdir(path):
            iters.append(glob.iglob(os.path.join(path, "**", "*.md"), recursive=True))
        else:
            print(f"Path doesn't exist: \"{path}\"")
            sys.exit(1)

    iters.append(iter(files))
    add_language(chain.from_iterable(iters), args.edit)

if __name__ == "__main__":
    main()