From c317ea967b822ef687e60be8a9b5af779eecb906 Mon Sep 17 00:00:00 2001
From: Jason Yundt <jason@jasonyundt.email>
Date: Sat, 30 Dec 2023 12:51:24 -0500
Subject: [PATCH] decoder: Autodetect encoding of YAML files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before this change, yamllint would open YAML files using open()’s
default encoding. As long as UTF-8 mode isn’t enabled, open() defaults
to using the system’s locale encoding [1][2].

Most of the time, the locale encoding on Linux systems is UTF-8 [3][4],
but it doesn’t have to be [5]. Additionally, the locale encoding on
Windows systems is the system’s ANSI code page [6]. As a result, you
would have to either enable UTF-8 mode, give Python a custom manifest or
enable a beta feature in Windows settings in order to lint UTF-8 YAML
files on Windows [2][7].

Finally, using open()’s default encoding is a violation of the YAML
spec. Chapter 5.2 says:

	“On input, a YAML processor must support the UTF-8 and UTF-16
	character encodings. For JSON compatibility, the UTF-32
	encodings must also be supported.

	If a character stream begins with a byte order mark, the
	character encoding will be taken to be as indicated by the byte
	order mark. Otherwise, the stream must begin with an ASCII
	character. This allows the encoding to be deduced by the pattern
	of null (x00) characters.” [8]

This change fixes all of those problems by implementing the YAML spec’s
character encoding detection algorithm. Now, as long as a YAML file
begins with either a byte order mark or an ASCII character, yamllint
will automatically detect it as being UTF-8, UTF-16 or UTF-32. Other
character encodings are not supported at the moment.

Fixes #218. Fixes #238. Fixes #347.

[1]: <https://docs.python.org/3.12/library/functions.html#open>
[2]: <https://docs.python.org/3.12/library/os.html#utf8-mode>
[3]: <https://sourceware.org/glibc/manual/html_node/Extended-Char-Intro.html>
[4]: <https://wiki.musl-libc.org/functional-differences-from-glibc.html#Character-sets-and-locale>
[5]: <https://sourceware.org/git/?p=glibc.git;a=blob;f=localedata/SUPPORTED;h=c8b63cc2fe2b4547f2fb1bff6193da68d70bd563;hb=36f2487f13e3540be9ee0fb51876b1da72176d3f>
[6]: <https://docs.python.org/3.12/glossary.html#term-locale-encoding>
[7]: <https://learn.microsoft.com/en-us/windows/apps/design/globalizing/use-utf8-code-page>
[8]: <https://yaml.org/spec/1.2.2/#52-character-encodings>
---
 tests/common.py       | 107 ++++++++++++++++++++
 tests/test_cli.py     |  53 +++++++++-
 tests/test_decoder.py | 221 ++++++++++++++++++++++++++++++++++++++++++
 yamllint/cli.py       |   2 +-
 yamllint/config.py    |   5 +-
 yamllint/decoder.py   |  60 ++++++++++++
 yamllint/linter.py    |   3 +
 7 files changed, 447 insertions(+), 4 deletions(-)
 create mode 100644 tests/test_decoder.py
 create mode 100644 yamllint/decoder.py

diff --git a/tests/common.py b/tests/common.py
index 78bb9cbf..579e853f 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -13,11 +13,16 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
+import codecs
+from codecs import CodecInfo as CI
+import collections
 import contextlib
 import os
 import shutil
+import sys
 import tempfile
 import unittest
+import warnings
 
 import yaml
 
@@ -25,6 +30,7 @@
 from yamllint import linter
 
 
+# Rule related stuff:
 class RuleTestCase(unittest.TestCase):
     def build_fake_config(self, conf):
         if conf is None:
@@ -54,6 +60,10 @@ def check(self, source, conf, **kwargs):
         self.assertEqual(real_problems, expected_problems)
 
 
+# Workspace related stuff:
+Blob = collections.namedtuple('Blob', ('text', 'encoding'))
+
+
 def build_temp_workspace(files):
     tempdir = tempfile.mkdtemp(prefix='yamllint-tests-')
 
@@ -65,6 +75,8 @@ def build_temp_workspace(files):
         if type(content) is list:
             os.mkdir(path)
         else:
+            if isinstance(content, Blob):
+                content = content.text.encode(content.encoding)
             mode = 'wb' if isinstance(content, bytes) else 'w'
             with open(path, mode) as f:
                 f.write(content)
@@ -84,3 +96,98 @@ def temp_workspace(files):
     finally:
         os.chdir(backup_wd)
         shutil.rmtree(wd)
+
+
+# Encoding related stuff:
+def encode_utf_32_be_sig(obj, errors='strict'):
+    return (
+        codecs.BOM_UTF32_BE + codecs.encode(obj, 'utf_32_be', errors),
+        len(obj)
+    )
+
+
+def encode_utf_32_le_sig(obj, errors='strict'):
+    return (
+        codecs.BOM_UTF32_LE + codecs.encode(obj, 'utf_32_le', errors),
+        len(obj)
+    )
+
+
+def encode_utf_16_be_sig(obj, errors='strict'):
+    return (
+        codecs.BOM_UTF16_BE + codecs.encode(obj, 'utf_16_be', errors),
+        len(obj)
+    )
+
+
+def encode_utf_16_le_sig(obj, errors='strict'):
+    return (
+        codecs.BOM_UTF16_LE + codecs.encode(obj, 'utf_16_le', errors),
+        len(obj)
+    )
+
+
+test_codec_infos = {
+    'utf_32_be_sig': CI(encode_utf_32_be_sig, codecs.getdecoder('utf_32')),
+    'utf_32_le_sig': CI(encode_utf_32_le_sig, codecs.getdecoder('utf_32')),
+    'utf_16_be_sig': CI(encode_utf_16_be_sig, codecs.getdecoder('utf_16')),
+    'utf_16_le_sig': CI(encode_utf_16_le_sig, codecs.getdecoder('utf_16')),
+}
+
+
+def register_test_codecs():
+    codecs.register(test_codec_infos.get)
+
+
+def unregister_test_codecs():
+    if sys.version_info >= (3, 10, 0):
+        codecs.unregister(test_codec_infos.get)
+    else:
+        warnings.warn(
+            "This version of Python doesn’t allow us to unregister codecs."
+        )
+
+
+def is_test_codec(codec):
+    return codec in test_codec_infos.keys()
+
+
+def test_codec_built_in_equivalent(test_codec):
+    return_value = test_codec
+    for suffix in ('_sig', '_be', '_le'):
+        return_value = return_value.replace(suffix, '')
+    return return_value
+
+
+def uses_bom(codec):
+    for suffix in ('_32', '_16', '_sig'):
+        if codec.endswith(suffix):
+            return True
+    return False
+
+
+def encoding_detectable(string, codec):
+    """
+    Returns True if encoding can be detected after string is encoded
+
+    Encoding detection only works if you’re using a BOM or the first character
+    is ASCII. See yamllint.decoder.auto_decode()’s docstring.
+    """
+    return uses_bom(codec) or (len(string) > 0 and string[0].isascii())
+
+
+def utf_codecs():
+    for chunk_size in ('32', '16'):
+        for endianness in ('be', 'le'):
+            for sig in ('', '_sig'):
+                yield f'utf_{chunk_size}_{endianness}{sig}'
+    yield 'utf_8_sig'
+    yield 'utf_8'
+
+
+def ws_with_files_in_many_codecs(path_template, text):
+    workspace = {}
+    for codec in utf_codecs():
+        if encoding_detectable(text, codec):
+            workspace[path_template.format(codec)] = Blob(text, codec)
+    return workspace
diff --git a/tests/test_cli.py b/tests/test_cli.py
index d158e326..3922eb5e 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -23,7 +23,9 @@
 import tempfile
 import unittest
 
-from tests.common import build_temp_workspace, temp_workspace
+from tests.common import (build_temp_workspace, temp_workspace,
+                          ws_with_files_in_many_codecs,
+                          register_test_codecs, unregister_test_codecs)
 
 from yamllint import cli
 from yamllint import config
@@ -797,3 +799,52 @@ def test_multiple_parent_config_file(self):
         self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
                          (0, './4spaces.yml:2:5: [warning] wrong indentation: '
                          'expected 3 but found 4 (indentation)\n', ''))
+
+
+class CommandLineEncodingTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        register_test_codecs()
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+        unregister_test_codecs()
+
+    def test_valid_encodings(self):
+        conf = ('---\n'
+                'rules:\n'
+                '  key-ordering: enable\n')
+        config_files = ws_with_files_in_many_codecs(
+            'config_{}.yaml',
+            conf
+        )
+        sorted_correctly = ('---\n'
+                            'Ａ: YAML\n'
+                            'Ｚ: YAML\n')
+        sorted_correctly_files = ws_with_files_in_many_codecs(
+            'sorted_correctly/{}.yaml',
+            sorted_correctly
+        )
+        sorted_incorrectly = ('---\n'
+                              'Ｚ: YAML\n'
+                              'Ａ: YAML\n')
+        sorted_incorrectly_files = ws_with_files_in_many_codecs(
+            'sorted_incorrectly/{}.yaml',
+            sorted_incorrectly
+        )
+        workspace = {
+            **config_files,
+            **sorted_correctly_files,
+            **sorted_incorrectly_files
+        }
+
+        with temp_workspace(workspace):
+            for config_path in config_files.keys():
+                with RunContext(self) as ctx:
+                    cli.run(('-c', config_path, 'sorted_correctly/'))
+                self.assertEqual(ctx.returncode, 0)
+                with RunContext(self) as ctx:
+                    cli.run(('-c', config_path, 'sorted_incorrectly/'))
+                self.assertNotEqual(ctx.returncode, 0)
diff --git a/tests/test_decoder.py b/tests/test_decoder.py
new file mode 100644
index 00000000..3bb5d1af
--- /dev/null
+++ b/tests/test_decoder.py
@@ -0,0 +1,221 @@
+# Copyright (C) 2023 Jason Yundt
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import codecs
+import unittest
+
+from tests.common import (register_test_codecs, unregister_test_codecs,
+                          utf_codecs, encoding_detectable, uses_bom,
+                          is_test_codec, test_codec_built_in_equivalent)
+
+from yamllint import decoder
+
+
+test_strings = (
+    "",
+    "y",
+    "yaml",
+    "🇾⁠🇦⁠🇲⁠🇱⁠❗"
+)
+setUpModule = register_test_codecs
+tearDownModule = unregister_test_codecs
+
+
+class EncodingStuffFromCommonTestCase(unittest.TestCase):
+    def test_test_codecs_and_utf_codecs(self):
+        error = "{} failed to correctly encode then decode {}."
+        for string in test_strings:
+            for codec in utf_codecs():
+                self.assertEqual(
+                    string,
+                    string.encode(codec).decode(codec),
+                    msg=error.format(repr(codec), repr(string))
+                )
+
+    def test_is_test_codec(self):
+        self.assertFalse(is_test_codec('utf_32'))
+        self.assertFalse(is_test_codec('utf_32_be'))
+        self.assertTrue(is_test_codec('utf_32_be_sig'))
+        self.assertFalse(is_test_codec('utf_32_le'))
+        self.assertTrue(is_test_codec('utf_32_le_sig'))
+
+        self.assertFalse(is_test_codec('utf_16'))
+        self.assertFalse(is_test_codec('utf_16_be'))
+        self.assertTrue(is_test_codec('utf_16_be_sig'))
+        self.assertFalse(is_test_codec('utf_16_le'))
+        self.assertTrue(is_test_codec('utf_16_le_sig'))
+
+        self.assertFalse(is_test_codec('utf_8'))
+        self.assertFalse(is_test_codec('utf_8_be'))
+
+    def test_test_codec_built_in_equivalent(self):
+        self.assertEqual(
+            'utf_32',
+            test_codec_built_in_equivalent('utf_32_be_sig')
+        )
+        self.assertEqual(
+            'utf_32',
+            test_codec_built_in_equivalent('utf_32_le_sig')
+        )
+
+        self.assertEqual(
+            'utf_16',
+            test_codec_built_in_equivalent('utf_16_be_sig')
+        )
+        self.assertEqual(
+            'utf_16',
+            test_codec_built_in_equivalent('utf_16_le_sig')
+        )
+
+    def test_uses_bom(self):
+        self.assertTrue(uses_bom('utf_32'))
+        self.assertFalse(uses_bom('utf_32_be'))
+        self.assertTrue(uses_bom('utf_32_be_sig'))
+        self.assertFalse(uses_bom('utf_32_le'))
+        self.assertTrue(uses_bom('utf_32_le_sig'))
+
+        self.assertTrue(uses_bom('utf_16'))
+        self.assertFalse(uses_bom('utf_16_be'))
+        self.assertTrue(uses_bom('utf_16_be_sig'))
+        self.assertFalse(uses_bom('utf_16_le'))
+        self.assertTrue(uses_bom('utf_16_le_sig'))
+
+        self.assertFalse(uses_bom('utf_8'))
+        self.assertTrue(uses_bom('utf_8_sig'))
+
+    def test_encoding_detectable(self):
+        # No BOM + nothing
+        self.assertFalse(encoding_detectable('', 'utf_32_be'))
+        self.assertFalse(encoding_detectable('', 'utf_32_le'))
+
+        self.assertFalse(encoding_detectable('', 'utf_16_be'))
+        self.assertFalse(encoding_detectable('', 'utf_16_le'))
+
+        self.assertFalse(encoding_detectable('', 'utf_8'))
+        # BOM + nothing
+        self.assertTrue(encoding_detectable('', 'utf_32'))
+        self.assertTrue(encoding_detectable('', 'utf_32_be_sig'))
+        self.assertTrue(encoding_detectable('', 'utf_32_le_sig'))
+
+        self.assertTrue(encoding_detectable('', 'utf_16'))
+        self.assertTrue(encoding_detectable('', 'utf_16_be_sig'))
+        self.assertTrue(encoding_detectable('', 'utf_16_le_sig'))
+
+        self.assertTrue(encoding_detectable('', 'utf_8_sig'))
+        # No BOM + non-ASCII
+        self.assertFalse(encoding_detectable('Ⓝⓔ', 'utf_32_be'))
+        self.assertFalse(encoding_detectable('ⓥⓔ', 'utf_32_le'))
+
+        self.assertFalse(encoding_detectable('ⓡ ', 'utf_16_be'))
+        self.assertFalse(encoding_detectable('ⓖⓞ', 'utf_16_le'))
+
+        self.assertFalse(encoding_detectable('ⓝⓝ', 'utf_8'))
+        # No BOM + ASCII
+        self.assertTrue(encoding_detectable('a ', 'utf_32_be'))
+        self.assertTrue(encoding_detectable('gi', 'utf_32_le'))
+
+        self.assertTrue(encoding_detectable('ve', 'utf_16_be'))
+        self.assertTrue(encoding_detectable(' y', 'utf_16_le'))
+
+        self.assertTrue(encoding_detectable('ou', 'utf_8'))
+        # BOM + non-ASCII
+        self.assertTrue(encoding_detectable('␣ⓤ', 'utf_32'))
+        self.assertTrue(encoding_detectable('ⓟ␤', 'utf_32_be_sig'))
+        self.assertTrue(encoding_detectable('Ⓝⓔ', 'utf_32_le_sig'))
+
+        self.assertTrue(encoding_detectable('ⓥⓔ', 'utf_16'))
+        self.assertTrue(encoding_detectable('ⓡ␣', 'utf_16_be_sig'))
+        self.assertTrue(encoding_detectable('ⓖⓞ', 'utf_16_le_sig'))
+
+        self.assertTrue(encoding_detectable('ⓝⓝ', 'utf_8_sig'))
+        # BOM + ASCII
+        self.assertTrue(encoding_detectable('a ', 'utf_32'))
+        self.assertTrue(encoding_detectable('le', 'utf_32_be_sig'))
+        self.assertTrue(encoding_detectable('t ', 'utf_32_le_sig'))
+
+        self.assertTrue(encoding_detectable('yo', 'utf_16'))
+        self.assertTrue(encoding_detectable('u ', 'utf_16_be_sig'))
+        self.assertTrue(encoding_detectable('do', 'utf_16_le_sig'))
+
+        self.assertTrue(encoding_detectable('wn', 'utf_8_sig'))
+
+
+class DecoderTestCase(unittest.TestCase):
+    def test_detect_encoding(self):
+        error1 = "{} was encoded with {}, but detect_encoding() returned {}."
+        error2 = "detect_encoding({}) returned a codec that isn’t built-in."
+        for string in test_strings:
+            for codec in utf_codecs():
+                input = string.encode(codec)
+
+                if not uses_bom(codec) and len(string) == 0:
+                    expected_output = 'utf_8'
+                elif not encoding_detectable(string, codec):
+                    expected_output = None
+                elif is_test_codec(codec):
+                    expected_output = test_codec_built_in_equivalent(codec)
+                else:
+                    expected_output = codec
+
+                actual_output = decoder.detect_encoding(input)
+                if expected_output is not None:
+                    self.assertEqual(
+                        expected_output,
+                        actual_output,
+                        msg=error1.format(
+                            input,
+                            repr(codec),
+                            repr(actual_output)
+                        )
+                    )
+
+                codec_info = codecs.lookup(actual_output)
+                self.assertFalse(
+                    is_test_codec(codec_info),
+                    msg=error2.format(input)
+                )
+
+    def test_auto_decode(self):
+        lenient_error_handlers = (
+            'ignore',
+            'replace',
+            'backslashreplace',
+            'surrogateescape',
+        )
+        at_least_one_decode_error = False
+        for string in test_strings:
+            for codec in utf_codecs():
+                input = string.encode(codec)
+                if encoding_detectable(string, codec) or len(string) == 0:
+                    actual_output = decoder.auto_decode(input)
+                    self.assertEqual(
+                        string,
+                        actual_output,
+                        msg=f"auto_decode({input}) returned the wrong value."
+                    )
+                    self.assertIsInstance(actual_output, str)
+                else:
+                    try:
+                        decoder.auto_decode(input)
+                    except UnicodeDecodeError:
+                        at_least_one_decode_error = True
+
+                for handler in lenient_error_handlers:
+                    actual_output = decoder.auto_decode(input, errors=handler)
+                    self.assertIsInstance(actual_output, str)
+        self.assertTrue(
+            at_least_one_decode_error,
+            msg="None of the test_strings triggered a decoding error."
+        )
diff --git a/yamllint/cli.py b/yamllint/cli.py
index 604e5940..8de3c55a 100644
--- a/yamllint/cli.py
+++ b/yamllint/cli.py
@@ -219,7 +219,7 @@ def run(argv=None):
     for file in find_files_recursively(args.files, conf):
         filepath = file[2:] if file.startswith('./') else file
         try:
-            with open(file, newline='') as f:
+            with open(file, mode='rb') as f:
                 problems = linter.run(f, conf, filepath)
         except OSError as e:
             print(e, file=sys.stderr)
diff --git a/yamllint/config.py b/yamllint/config.py
index b07229f5..45ea3c3d 100644
--- a/yamllint/config.py
+++ b/yamllint/config.py
@@ -19,6 +19,7 @@
 import pathspec
 import yaml
 
+from yamllint import decoder
 import yamllint.rules
 
 
@@ -38,8 +39,8 @@ def __init__(self, content=None, file=None):
         self.locale = None
 
         if file is not None:
-            with open(file) as f:
-                content = f.read()
+            with open(file, mode='rb') as f:
+                content = decoder.auto_decode(f.read())
 
         self.parse(content)
         self.validate()
diff --git a/yamllint/decoder.py b/yamllint/decoder.py
new file mode 100644
index 00000000..47eb9988
--- /dev/null
+++ b/yamllint/decoder.py
@@ -0,0 +1,60 @@
+# Copyright (C) 2023 Jason Yundt
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+import codecs
+
+
+def detect_encoding(stream_data):
+    """
+    Return stream_data’s character encoding
+
+    Specifically, this function will take a bytes object and return a string
+    that contains the name of one of Python’s built-in codecs [1].
+
+    The YAML spec says that streams must begin with a BOM or an ASCII
+    character. If stream_data doesn’t begin with either of those, then this
+    function might return the wrong encoding. See chapter 5.2 of the YAML spec
+    for details [2].
+
+    [1]: <https://docs.python.org/3/library/codecs.html#standard-encodings>
+    [2]: <https://yaml.org/spec/1.2.2/#52-character-encodings>
+    """
+    if stream_data.startswith(codecs.BOM_UTF32_BE):
+        return 'utf_32'
+    elif stream_data.startswith(b'\x00\x00\x00') and len(stream_data) >= 4:
+        return 'utf_32_be'
+    elif stream_data.startswith(codecs.BOM_UTF32_LE):
+        return 'utf_32'
+    elif stream_data[1:4] == b'\x00\x00\x00':
+        return 'utf_32_le'
+    elif stream_data.startswith(codecs.BOM_UTF16_BE):
+        return 'utf_16'
+    elif stream_data.startswith(b'\x00') and len(stream_data) >= 2:
+        return 'utf_16_be'
+    elif stream_data.startswith(codecs.BOM_UTF16_LE):
+        return 'utf_16'
+    elif stream_data[1:2] == b'\x00':
+        return 'utf_16_le'
+    elif stream_data.startswith(codecs.BOM_UTF8):
+        return 'utf_8_sig'
+    else:
+        return 'utf_8'
+
+
+def auto_decode(stream_data, errors='strict'):
+    return stream_data.decode(
+        encoding=detect_encoding(stream_data),
+        errors=errors
+    )
diff --git a/yamllint/linter.py b/yamllint/linter.py
index 0de1f716..caf5111e 100644
--- a/yamllint/linter.py
+++ b/yamllint/linter.py
@@ -18,6 +18,7 @@
 
 import yaml
 
+from yamllint import decoder
 from yamllint import parser
 
 
@@ -188,6 +189,8 @@ def get_syntax_error(buffer):
 def _run(buffer, conf, filepath):
     assert hasattr(buffer, '__getitem__'), \
         '_run() argument must be a buffer, not a stream'
+    if isinstance(buffer, bytes):
+        buffer = decoder.auto_decode(buffer)
 
     first_line = next(parser.line_generator(buffer)).content
     if re.match(r'^#\s*yamllint disable-file\s*$', first_line):