Skip to content

Commit d551970

Browse files
authored
support parsing SQL with encodings other than utf8 (pingcap#1312)
1 parent ea70ab7 commit d551970

10 files changed

+335
-44
lines changed

charset/charset.go

+10
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,16 @@ func ValidCharsetAndCollation(cs string, co string) bool {
107107
return ok
108108
}
109109

110+
// GetDefaultCollationLegacy is compatible with the charset support in old version parser.
111+
func GetDefaultCollationLegacy(charset string) (string, error) {
112+
switch strings.ToLower(charset) {
113+
case CharsetUTF8, CharsetUTF8MB4, CharsetASCII, CharsetLatin1, CharsetBin:
114+
return GetDefaultCollation(charset)
115+
default:
116+
return "", errors.Errorf("Unknown charset %s", charset)
117+
}
118+
}
119+
110120
// GetDefaultCollation returns the default collation for charset.
111121
func GetDefaultCollation(charset string) (string, error) {
112122
cs, err := GetCharsetInfo(charset)

charset/encoding.go

+137
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
// Copyright 2021 PingCAP, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
package charset
15+
16+
import (
17+
"strings"
18+
19+
"golang.org/x/text/encoding"
20+
"golang.org/x/text/transform"
21+
)
22+
23+
const (
24+
encodingBufferSizeDefault = 1024
25+
encodingBufferSizeRecycleThreshold = 4 * 1024
26+
27+
encodingDefault = "utf-8"
28+
)
29+
30+
// EncodingLabel is an encoding label in normalized form: trimmed of
// surrounding whitespace and lower-cased. Obtain one with Format, or with
// Formatted when the string is already known to be normalized.
type EncodingLabel string

// Format normalizes label by stripping leading and trailing tab, newline,
// carriage-return, form-feed and space characters and then lower-casing the
// remainder.
func Format(label string) EncodingLabel {
	trimmed := strings.Trim(label, "\t\n\r\f ")
	return EncodingLabel(strings.ToLower(trimmed))
}

// Formatted converts label to an EncodingLabel without touching it. Use it
// only when the label is already trimmed and lower-case.
func Formatted(label string) EncodingLabel {
	return EncodingLabel(label)
}
41+
42+
// Encoding provide a interface to encode/decode a string with specific encoding.
43+
type Encoding struct {
44+
enc encoding.Encoding
45+
name string
46+
charLength func([]byte) int
47+
buffer []byte
48+
}
49+
50+
// Enabled indicates whether the non-utf8 encoding is used.
51+
func (e *Encoding) Enabled() bool {
52+
return e.enc != nil && e.charLength != nil
53+
}
54+
55+
// Name returns the name of the current encoding.
56+
func (e *Encoding) Name() string {
57+
return e.name
58+
}
59+
60+
// NewEncoding creates a new Encoding.
61+
func NewEncoding(label EncodingLabel) *Encoding {
62+
if len(label) == 0 {
63+
return &Encoding{}
64+
}
65+
e, name := lookup(label)
66+
if e != nil && name != encodingDefault {
67+
return &Encoding{
68+
enc: e,
69+
name: name,
70+
charLength: FindNextCharacterLength(name),
71+
buffer: make([]byte, encodingBufferSizeDefault),
72+
}
73+
}
74+
return &Encoding{name: name}
75+
}
76+
77+
// UpdateEncoding updates to a new Encoding without changing the buffer.
78+
func (e *Encoding) UpdateEncoding(label EncodingLabel) {
79+
enc, name := lookup(label)
80+
e.name = name
81+
if enc != nil && name != encodingDefault {
82+
e.enc = enc
83+
}
84+
if len(e.buffer) == 0 {
85+
e.buffer = make([]byte, encodingBufferSizeDefault)
86+
}
87+
}
88+
89+
// Encode encodes the bytes to a string.
90+
func (e *Encoding) Encode(src []byte) (string, bool) {
91+
return e.transform(e.enc.NewEncoder(), src)
92+
}
93+
94+
// Decode decodes the bytes to a string.
95+
func (e *Encoding) Decode(src []byte) (string, bool) {
96+
return e.transform(e.enc.NewDecoder(), src)
97+
}
98+
99+
func (e *Encoding) transform(transformer transform.Transformer, src []byte) (string, bool) {
100+
if len(e.buffer) < len(src) {
101+
e.buffer = make([]byte, len(src)*2)
102+
}
103+
var destOffset, srcOffset int
104+
ok := true
105+
for {
106+
nextLen := 4
107+
if e.charLength != nil {
108+
nextLen = e.charLength(src[srcOffset:])
109+
}
110+
srcEnd := srcOffset + nextLen
111+
if srcEnd > len(src) {
112+
srcEnd = len(src)
113+
}
114+
nDest, nSrc, err := transformer.Transform(e.buffer[destOffset:], src[srcOffset:srcEnd], false)
115+
destOffset += nDest
116+
srcOffset += nSrc
117+
if err == nil {
118+
if srcOffset >= len(src) {
119+
result := string(e.buffer[:destOffset])
120+
if len(e.buffer) > encodingBufferSizeRecycleThreshold {
121+
// This prevents Encoding from holding too much memory.
122+
e.buffer = make([]byte, encodingBufferSizeDefault)
123+
}
124+
return result, ok
125+
}
126+
} else if err == transform.ErrShortDst {
127+
newDest := make([]byte, len(e.buffer)*2)
128+
copy(newDest, e.buffer)
129+
e.buffer = newDest
130+
} else {
131+
e.buffer[destOffset] = byte('?')
132+
destOffset += 1
133+
srcOffset += 1
134+
ok = false
135+
}
136+
}
137+
}

charset/encoding_table.go

+34-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,11 @@ import (
3131
// leading and trailing whitespace.
3232
func Lookup(label string) (e encoding.Encoding, name string) {
3333
label = strings.ToLower(strings.Trim(label, "\t\n\r\f "))
34-
enc := encodings[label]
34+
return lookup(Formatted(label))
35+
}
36+
37+
// lookup returns the encoding registered in the encodings table under label,
// together with the name stored alongside it. The label is used as-is, so it
// must already be normalized. An unknown label yields a nil encoding and an
// empty name.
func lookup(label EncodingLabel) (e encoding.Encoding, name string) {
	entry := encodings[string(label)]
	e, name = entry.e, entry.name
	return e, name
}
3741

@@ -258,3 +262,32 @@ var encodings = map[string]struct {
258262
"utf-16le": {unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM), "utf-16le"},
259263
"x-user-defined": {charmap.XUserDefined, "x-user-defined"},
260264
}
265+
266+
// FindNextCharacterLength is used in lexer.peek() to determine the next character length.
267+
func FindNextCharacterLength(label string) func([]byte) int {
268+
if f, ok := encodingNextCharacterLength[label]; ok {
269+
return f
270+
}
271+
return nil
272+
}
273+
274+
// encodingNextCharacterLength maps an encoding name to a function that
// reports how many bytes the character at the start of a slice occupies,
// judging by its first byte only. Every function returns 1 for an empty slice.
var encodingNextCharacterLength = map[string]func([]byte) int{
	// https://en.wikipedia.org/wiki/GBK_(character_encoding)#Layout_diagram
	"gbk": func(bs []byte) int {
		if len(bs) > 0 && bs[0] >= 0x80 {
			return 2
		}
		// A byte in the range 00–7F is a single byte that means the same thing as it does in ASCII.
		return 1
	},
	"utf-8": func(bs []byte) int {
		if len(bs) == 0 {
			return 1
		}
		lead := bs[0]
		switch {
		case lead < 0x80:
			return 1
		case lead < 0xe0:
			return 2
		case lead < 0xf0:
			return 3
		default:
			return 4
		}
	},
}

hintparserimpl.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -129,11 +129,11 @@ func (hp *hintParser) parse(input string, sqlMode mysql.SQLMode, initPos Pos) ([
129129
hp.result = nil
130130
hp.lexer.reset(input[3:])
131131
hp.lexer.SetSQLMode(sqlMode)
132-
hp.lexer.r.p = Pos{
132+
hp.lexer.r.updatePos(Pos{
133133
Line: initPos.Line,
134134
Col: initPos.Col + 3, // skipped the initial '/*+'
135135
Offset: 0,
136-
}
136+
})
137137
hp.lexer.inBangComment = true // skip the final '*/' (we need the '*/' for reporting warnings)
138138

139139
yyhintParse(&hp.lexer, hp)

0 commit comments

Comments
 (0)