decoder: Autodetect encoding of YAML files
Before this change, yamllint would open YAML files using open()’s
default encoding. As long as UTF-8 mode isn’t enabled, open() defaults
to using the system’s locale encoding [1][2].

Most of the time, the locale encoding on Linux systems is UTF-8 [3][4],
but it doesn’t have to be [5]. Additionally, the locale encoding on
Windows systems is the system’s ANSI code page [6]. As a result, you
would have to either enable UTF-8 mode, give Python a custom manifest or
enable a beta feature in Windows settings in order to lint UTF-8 YAML
files on Windows [2][7].

Finally, using open()’s default encoding is a violation of the YAML
spec. Chapter 5.2 says:

	“On input, a YAML processor must support the UTF-8 and UTF-16
	character encodings. For JSON compatibility, the UTF-32
	encodings must also be supported.

	If a character stream begins with a byte order mark, the
	character encoding will be taken to be as indicated by the byte
	order mark. Otherwise, the stream must begin with an ASCII
	character. This allows the encoding to be deduced by the pattern
	of null (x00) characters.” [8]

This change fixes all of those problems by implementing the YAML spec’s
character encoding detection algorithm. Now, as long as YAML files
begin with either a byte order mark or an ASCII character, yamllint
will automatically detect them as being UTF-8, UTF-16 or UTF-32. Other
character encodings are not supported at the moment.
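
For illustration only, here is a minimal sketch of that detection order in Python. The function name detect_encoding and its return values are invented for this example; the commit's own implementation (yamllint.decoder.auto_decode(), referenced in the tests below) may differ in detail.

import codecs


def detect_encoding(stream_bytes):
    """Guess the UTF encoding of a YAML byte stream (spec section 5.2)."""
    prefix = stream_bytes[:4]
    # A byte order mark wins if present. UTF-32 BOMs are checked before
    # UTF-16 ones because BOM_UTF32_LE begins with the bytes of BOM_UTF16_LE.
    # Python's 'utf_32', 'utf_16' and 'utf_8_sig' codecs consume the BOM
    # while decoding.
    if prefix.startswith((codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE)):
        return 'utf_32'
    elif prefix.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)):
        return 'utf_16'
    elif prefix.startswith(codecs.BOM_UTF8):
        return 'utf_8_sig'
    # No BOM: the spec lets us assume an ASCII first character, so the
    # placement of null bytes identifies the encoding.
    elif prefix.startswith(b'\x00\x00\x00'):
        return 'utf_32_be'
    elif prefix[1:4] == b'\x00\x00\x00':
        return 'utf_32_le'
    elif prefix.startswith(b'\x00'):
        return 'utf_16_be'
    elif prefix[1:2] == b'\x00':
        return 'utf_16_le'
    else:
        return 'utf_8'

For example, detect_encoding(b'\x00\x00\x00-') returns 'utf_32_be': an ASCII first character ('-') encoded as UTF-32 BE yields three leading null bytes.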

Fixes adrienverge#218. Fixes adrienverge#238. Fixes adrienverge#347.

[1]: <https://docs.python.org/3.12/library/functions.html#open>
[2]: <https://docs.python.org/3.12/library/os.html#utf8-mode>
[3]: <https://sourceware.org/glibc/manual/html_node/Extended-Char-Intro.html>
[4]: <https://wiki.musl-libc.org/functional-differences-from-glibc.html#Character-sets-and-locale>
[5]: <https://sourceware.org/git/?p=glibc.git;a=blob;f=localedata/SUPPORTED;h=c8b63cc2fe2b4547f2fb1bff6193da68d70bd563;hb=36f2487f13e3540be9ee0fb51876b1da72176d3f>
[6]: <https://docs.python.org/3.12/glossary.html#term-locale-encoding>
[7]: <https://learn.microsoft.com/en-us/windows/apps/design/globalizing/use-utf8-code-page>
[8]: <https://yaml.org/spec/1.2.2/#52-character-encodings>
Jayman2000 committed Jan 3, 2024
1 parent 0a10d50 commit c317ea9
Showing 7 changed files with 447 additions and 4 deletions.
107 changes: 107 additions & 0 deletions tests/common.py
@@ -13,18 +13,24 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import codecs
from codecs import CodecInfo as CI
import collections
import contextlib
import os
import shutil
import sys
import tempfile
import unittest
import warnings

import yaml

from yamllint.config import YamlLintConfig
from yamllint import linter


# Rule related stuff:
class RuleTestCase(unittest.TestCase):
    def build_fake_config(self, conf):
        if conf is None:
@@ -54,6 +60,10 @@ def check(self, source, conf, **kwargs):
        self.assertEqual(real_problems, expected_problems)


# Workspace related stuff:
Blob = collections.namedtuple('Blob', ('text', 'encoding'))


def build_temp_workspace(files):
    tempdir = tempfile.mkdtemp(prefix='yamllint-tests-')

@@ -65,6 +75,8 @@ def build_temp_workspace(files):
        if type(content) is list:
            os.mkdir(path)
        else:
            if isinstance(content, Blob):
                content = content.text.encode(content.encoding)
            mode = 'wb' if isinstance(content, bytes) else 'w'
            with open(path, mode) as f:
                f.write(content)
@@ -84,3 +96,98 @@ def temp_workspace(files):
    finally:
        os.chdir(backup_wd)
        shutil.rmtree(wd)


# Encoding related stuff:
def encode_utf_32_be_sig(obj, errors='strict'):
    return (
        codecs.BOM_UTF32_BE + codecs.encode(obj, 'utf_32_be', errors),
        len(obj)
    )


def encode_utf_32_le_sig(obj, errors='strict'):
    return (
        codecs.BOM_UTF32_LE + codecs.encode(obj, 'utf_32_le', errors),
        len(obj)
    )


def encode_utf_16_be_sig(obj, errors='strict'):
    return (
        codecs.BOM_UTF16_BE + codecs.encode(obj, 'utf_16_be', errors),
        len(obj)
    )


def encode_utf_16_le_sig(obj, errors='strict'):
    return (
        codecs.BOM_UTF16_LE + codecs.encode(obj, 'utf_16_le', errors),
        len(obj)
    )


test_codec_infos = {
    'utf_32_be_sig': CI(encode_utf_32_be_sig, codecs.getdecoder('utf_32')),
    'utf_32_le_sig': CI(encode_utf_32_le_sig, codecs.getdecoder('utf_32')),
    'utf_16_be_sig': CI(encode_utf_16_be_sig, codecs.getdecoder('utf_16')),
    'utf_16_le_sig': CI(encode_utf_16_le_sig, codecs.getdecoder('utf_16')),
}


def register_test_codecs():
    codecs.register(test_codec_infos.get)


def unregister_test_codecs():
    if sys.version_info >= (3, 10, 0):
        codecs.unregister(test_codec_infos.get)
    else:
        warnings.warn(
            "This version of Python doesn’t allow us to unregister codecs."
        )


def is_test_codec(codec):
    return codec in test_codec_infos.keys()


def test_codec_built_in_equivalent(test_codec):
    return_value = test_codec
    for suffix in ('_sig', '_be', '_le'):
        return_value = return_value.replace(suffix, '')
    return return_value


def uses_bom(codec):
    for suffix in ('_32', '_16', '_sig'):
        if codec.endswith(suffix):
            return True
    return False


def encoding_detectable(string, codec):
    """
    Returns True if encoding can be detected after string is encoded

    Encoding detection only works if you’re using a BOM or the first character
    is ASCII. See yamllint.decoder.auto_decode()’s docstring.
    """
    return uses_bom(codec) or (len(string) > 0 and string[0].isascii())


def utf_codecs():
    for chunk_size in ('32', '16'):
        for endianness in ('be', 'le'):
            for sig in ('', '_sig'):
                yield f'utf_{chunk_size}_{endianness}{sig}'
    yield 'utf_8_sig'
    yield 'utf_8'


def ws_with_files_in_many_codecs(path_template, text):
    workspace = {}
    for codec in utf_codecs():
        if encoding_detectable(text, codec):
            workspace[path_template.format(codec)] = Blob(text, codec)
    return workspace
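
For orientation, a sketch of how these helpers combine; the call below is hypothetical and not part of the diff:

register_test_codecs()
workspace = ws_with_files_in_many_codecs('doc_{}.yaml', '---\na: 1\n')
# The text starts with an ASCII character, so encoding_detectable() accepts
# every codec from utf_codecs(), and workspace maps names such as
# 'doc_utf_16_le_sig.yaml' to Blob(text='---\na: 1\n', encoding='utf_16_le_sig').
# build_temp_workspace() later encodes each Blob with its own codec, which is
# why the '*_sig' entries need the test codecs registered above.
unregister_test_codecs()
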
53 changes: 52 additions & 1 deletion tests/test_cli.py
@@ -23,7 +23,9 @@
import tempfile
import unittest

from tests.common import build_temp_workspace, temp_workspace
from tests.common import (build_temp_workspace, temp_workspace,
                          ws_with_files_in_many_codecs,
                          register_test_codecs, unregister_test_codecs)

from yamllint import cli
from yamllint import config
@@ -797,3 +799,52 @@ def test_multiple_parent_config_file(self):
        self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
                         (0, './4spaces.yml:2:5: [warning] wrong indentation: '
                             'expected 3 but found 4 (indentation)\n', ''))


class CommandLineEncodingTestCase(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        register_test_codecs()

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        unregister_test_codecs()

    def test_valid_encodings(self):
        conf = ('---\n'
                'rules:\n'
                ' key-ordering: enable\n')
        config_files = ws_with_files_in_many_codecs(
            'config_{}.yaml',
            conf
        )
        sorted_correctly = ('---\n'
                            'A: YAML\n'
                            'Z: YAML\n')
        sorted_correctly_files = ws_with_files_in_many_codecs(
            'sorted_correctly/{}.yaml',
            sorted_correctly
        )
        sorted_incorrectly = ('---\n'
                              'Z: YAML\n'
                              'A: YAML\n')
        sorted_incorrectly_files = ws_with_files_in_many_codecs(
            'sorted_incorrectly/{}.yaml',
            sorted_incorrectly
        )
        workspace = {
            **config_files,
            **sorted_correctly_files,
            **sorted_incorrectly_files
        }

        with temp_workspace(workspace):
            for config_path in config_files.keys():
                with RunContext(self) as ctx:
                    cli.run(('-c', config_path, 'sorted_correctly/'))
                self.assertEqual(ctx.returncode, 0)
                with RunContext(self) as ctx:
                    cli.run(('-c', config_path, 'sorted_incorrectly/'))
                self.assertNotEqual(ctx.returncode, 0)
