decoder: Autodetect encoding of YAML files
Before this change, yamllint would open YAML files using open()’s
default encoding. As long as UTF-8 mode isn’t enabled, open() defaults
to using the system’s locale encoding [1][2].

Most of the time, the locale encoding on Linux systems is UTF-8 [3][4],
but it doesn’t have to be [5]. Additionally, the locale encoding on
Windows systems is the system’s ANSI code page [6]. As a result, you
would have to either enable UTF-8 mode, give Python a custom manifest or
enable a beta feature in Windows settings in order to lint UTF-8 YAML
files on Windows [2][7].

Finally, using open()’s default encoding is a violation of the YAML
spec. Chapter 5.2 says:

	“On input, a YAML processor must support the UTF-8 and UTF-16
	character encodings. For JSON compatibility, the UTF-32
	encodings must also be supported.

	If a character stream begins with a byte order mark, the
	character encoding will be taken to be as indicated by the byte
	order mark. Otherwise, the stream must begin with an ASCII
	character. This allows the encoding to be deduced by the pattern
	of null (x00) characters.” [8]

This change fixes all of those problems by implementing the YAML spec’s
character encoding detection algorithm. Now, as long as YAML files
begin with either a byte order mark or an ASCII character, yamllint
will automatically detect them as being UTF-8, UTF-16 or UTF-32. Other
character encodings are not supported at the moment.
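
For illustration only, here is a minimal sketch of that detection order in Python. The function name detect_encoding and its return values are invented for this example; the commit's own implementation (yamllint.decoder.auto_decode(), referenced in the tests below) may differ in detail.

import codecs


def detect_encoding(stream_bytes):
    """Guess the UTF encoding of a YAML byte stream (spec section 5.2)."""
    prefix = stream_bytes[:4]
    # A byte order mark wins if present. UTF-32 BOMs are checked before
    # UTF-16 ones because BOM_UTF32_LE begins with the bytes of BOM_UTF16_LE.
    # Python's 'utf_32', 'utf_16' and 'utf_8_sig' codecs consume the BOM
    # while decoding.
    if prefix.startswith((codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE)):
        return 'utf_32'
    elif prefix.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
        return 'utf_16'
    elif prefix.startswith(codecs.BOM_UTF8):
        return 'utf_8_sig'
    # No BOM: the spec lets us assume an ASCII first character, so the
    # placement of null bytes identifies the encoding.
    elif prefix.startswith(b'\x00\x00\x00'):
        return 'utf_32_be'
    elif prefix[1:4] == b'\x00\x00\x00':
        return 'utf_32_le'
    elif prefix.startswith(b'\x00'):
        return 'utf_16_be'
    elif prefix[1:2] == b'\x00':
        return 'utf_16_le'
    else:
        return 'utf_8'

For example, detect_encoding(b'\x00\x00\x00-') returns 'utf_32_be': an ASCII first character ('-') encoded as UTF-32 BE yields three leading null bytes.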

Fixes adrienverge#218. Fixes adrienverge#238. Fixes adrienverge#347.

[1]: <https://docs.python.org/3.12/library/functions.html#open>
[2]: <https://docs.python.org/3.12/library/os.html#utf8-mode>
[3]: <https://sourceware.org/glibc/manual/html_node/Extended-Char-Intro.html>
[4]: <https://wiki.musl-libc.org/functional-differences-from-glibc.html#Character-sets-and-locale>
[5]: <https://sourceware.org/git/?p=glibc.git;a=blob;f=localedata/SUPPORTED;h=c8b63cc2fe2b4547f2fb1bff6193da68d70bd563;hb=36f2487f13e3540be9ee0fb51876b1da72176d3f>
[6]: <https://docs.python.org/3.12/glossary.html#term-locale-encoding>
[7]: <https://learn.microsoft.com/en-us/windows/apps/design/globalizing/use-utf8-code-page>
[8]: <https://yaml.org/spec/1.2.2/#52-character-encodings>
Jayman2000 committed Jan 3, 2024
1 parent 0a10d50 commit c317ea9
Showing 7 changed files with 447 additions and 4 deletions.
107 changes: 107 additions & 0 deletions tests/common.py
@@ -13,18 +13,24 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import codecs
from codecs import CodecInfo as CI
import collections
import contextlib
import os
import shutil
import sys
import tempfile
import unittest
import warnings

import yaml

from yamllint.config import YamlLintConfig
from yamllint import linter


# Rule related stuff:
class RuleTestCase(unittest.TestCase):
    def build_fake_config(self, conf):
        if conf is None:
@@ -54,6 +60,10 @@ def check(self, source, conf, **kwargs):
        self.assertEqual(real_problems, expected_problems)


# Workspace related stuff:
Blob = collections.namedtuple('Blob', ('text', 'encoding'))


def build_temp_workspace(files):
    tempdir = tempfile.mkdtemp(prefix='yamllint-tests-')

@@ -65,6 +75,8 @@ def build_temp_workspace(files):
        if type(content) is list:
            os.mkdir(path)
        else:
            if isinstance(content, Blob):
                content = content.text.encode(content.encoding)
            mode = 'wb' if isinstance(content, bytes) else 'w'
            with open(path, mode) as f:
                f.write(content)
@@ -84,3 +96,98 @@ def temp_workspace(files):
    finally:
        os.chdir(backup_wd)
        shutil.rmtree(wd)


# Encoding related stuff:
def encode_utf_32_be_sig(obj, errors='strict'):
    return (
        codecs.BOM_UTF32_BE + codecs.encode(obj, 'utf_32_be', errors),
        len(obj)
    )


def encode_utf_32_le_sig(obj, errors='strict'):
    return (
        codecs.BOM_UTF32_LE + codecs.encode(obj, 'utf_32_le', errors),
        len(obj)
    )


def encode_utf_16_be_sig(obj, errors='strict'):
    return (
        codecs.BOM_UTF16_BE + codecs.encode(obj, 'utf_16_be', errors),
        len(obj)
    )


def encode_utf_16_le_sig(obj, errors='strict'):
    return (
        codecs.BOM_UTF16_LE + codecs.encode(obj, 'utf_16_le', errors),
        len(obj)
    )


test_codec_infos = {
    'utf_32_be_sig': CI(encode_utf_32_be_sig, codecs.getdecoder('utf_32')),
    'utf_32_le_sig': CI(encode_utf_32_le_sig, codecs.getdecoder('utf_32')),
    'utf_16_be_sig': CI(encode_utf_16_be_sig, codecs.getdecoder('utf_16')),
    'utf_16_le_sig': CI(encode_utf_16_le_sig, codecs.getdecoder('utf_16')),
}


def register_test_codecs():
    codecs.register(test_codec_infos.get)


def unregister_test_codecs():
    if sys.version_info >= (3, 10, 0):
        codecs.unregister(test_codec_infos.get)
    else:
        warnings.warn(
            "This version of Python doesn’t allow us to unregister codecs."
        )


def is_test_codec(codec):
    return codec in test_codec_infos.keys()


def test_codec_built_in_equivalent(test_codec):
    return_value = test_codec
    for suffix in ('_sig', '_be', '_le'):
        return_value = return_value.replace(suffix, '')
    return return_value


def uses_bom(codec):
    for suffix in ('_32', '_16', '_sig'):
        if codec.endswith(suffix):
            return True
    return False


def encoding_detectable(string, codec):
    """
    Returns True if encoding can be detected after string is encoded

    Encoding detection only works if you’re using a BOM or the first character
    is ASCII. See yamllint.decoder.auto_decode()’s docstring.
    """
    return uses_bom(codec) or (len(string) > 0 and string[0].isascii())


def utf_codecs():
    for chunk_size in ('32', '16'):
        for endianness in ('be', 'le'):
            for sig in ('', '_sig'):
                yield f'utf_{chunk_size}_{endianness}{sig}'
    yield 'utf_8_sig'
    yield 'utf_8'


def ws_with_files_in_many_codecs(path_template, text):
    workspace = {}
    for codec in utf_codecs():
        if encoding_detectable(text, codec):
            workspace[path_template.format(codec)] = Blob(text, codec)
    return workspace
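
For orientation, a sketch of how these helpers combine; the call below is hypothetical and not part of the diff:

register_test_codecs()
workspace = ws_with_files_in_many_codecs('doc_{}.yaml', '---\na: 1\n')
# The text starts with an ASCII character, so encoding_detectable() accepts
# every codec from utf_codecs(), and workspace maps names such as
# 'doc_utf_16_le_sig.yaml' to Blob(text='---\na: 1\n', encoding='utf_16_le_sig').
# build_temp_workspace() later encodes each Blob with its own codec, which is
# why the '*_sig' entries need the test codecs registered above.
unregister_test_codecs()
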
53 changes: 52 additions & 1 deletion tests/test_cli.py
@@ -23,7 +23,9 @@
import tempfile
import unittest

from tests.common import build_temp_workspace, temp_workspace
from tests.common import (build_temp_workspace, temp_workspace,
                          ws_with_files_in_many_codecs,
                          register_test_codecs, unregister_test_codecs)

from yamllint import cli
from yamllint import config
@@ -797,3 +799,52 @@ def test_multiple_parent_config_file(self):
        self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
                         (0, './4spaces.yml:2:5: [warning] wrong indentation: '
                             'expected 3 but found 4 (indentation)\n', ''))


class CommandLineEncodingTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        register_test_codecs()

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        unregister_test_codecs()

    def test_valid_encodings(self):
        conf = ('---\n'
                'rules:\n'
                ' key-ordering: enable\n')
        config_files = ws_with_files_in_many_codecs(
            'config_{}.yaml',
            conf
        )
        sorted_correctly = ('---\n'
                            'A: YAML\n'
                            'Z: YAML\n')
        sorted_correctly_files = ws_with_files_in_many_codecs(
            'sorted_correctly/{}.yaml',
            sorted_correctly
        )
        sorted_incorrectly = ('---\n'
                              'Z: YAML\n'
                              'A: YAML\n')
        sorted_incorrectly_files = ws_with_files_in_many_codecs(
            'sorted_incorrectly/{}.yaml',
            sorted_incorrectly
        )
        workspace = {
            **config_files,
            **sorted_correctly_files,
            **sorted_incorrectly_files
        }

        with temp_workspace(workspace):
            for config_path in config_files.keys():
                with RunContext(self) as ctx:
                    cli.run(('-c', config_path, 'sorted_correctly/'))
                self.assertEqual(ctx.returncode, 0)
                with RunContext(self) as ctx:
                    cli.run(('-c', config_path, 'sorted_incorrectly/'))
                self.assertNotEqual(ctx.returncode, 0)
