Skip to content

Commit b9ee668

Browse files
committed
change dict patch logic
Auto-detect environments with special Unihans and patch the corresponding dict. Ref #3
1 parent 9b848cb commit b9ee668

File tree

6 files changed

+175
-165
lines changed

6 files changed

+175
-165
lines changed

scripts/webpack.config.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ const ROOT = path.resolve(__dirname, '..')
2929
module.exports = [
3030
// browser version tiny-pinyin
3131
{
32-
entry: path.resolve(ROOT, 'src/browser.js'),
32+
entry: path.resolve(ROOT, 'src/patched.js'),
3333
output: {
3434
filename: 'docs/browser.js',
3535
path: ROOT,

src/browser.js

Lines changed: 0 additions & 15 deletions
This file was deleted.

src/core.js

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
'use strict'
2+
3+
const DICT = require('./dict')
4+
5+
const FIRST_PINYIN_UNIHAN = '\u963F'
6+
const LAST_PINYIN_UNIHAN = '\u9FFF'
7+
8+
const LATIN = 1
9+
const PINYIN = 2
10+
const UNKNOWN = 3
11+
12+
let supported = null
13+
let COLLATOR
14+
15+
function patchDict (patchers) {
16+
if (!patchers) return
17+
if (typeof patchers === 'function') {
18+
patchers = [patchers]
19+
}
20+
if (patchers.forEach) {
21+
patchers.forEach(p => {
22+
typeof p === 'function' && p(DICT)
23+
})
24+
}
25+
}
26+
27+
function isSupported (force) {
28+
if (!force && supported !== null) {
29+
return supported
30+
}
31+
if (typeof Intl === 'object' && Intl.Collator) {
32+
COLLATOR = new Intl.Collator(['zh-Hans-CN', 'zh-CN'])
33+
supported = Intl.Collator.supportedLocalesOf(['zh-CN']).length === 1
34+
} else {
35+
supported = false
36+
}
37+
return supported
38+
}
39+
40+
function genToken (ch) {
41+
// Access DICT here, give the chance to patch DICT.
42+
const UNIHANS = DICT.UNIHANS
43+
const PINYINS = DICT.PINYINS
44+
const EXCEPTIONS = DICT.EXCEPTIONS
45+
const token = {
46+
source: ch
47+
}
48+
49+
// First check EXCEPTIONS map, then search with UNIHANS table.
50+
if (ch in EXCEPTIONS) {
51+
token.type = PINYIN
52+
token.target = EXCEPTIONS[ch]
53+
return token
54+
}
55+
56+
let offset = -1
57+
let cmp
58+
if (ch.charCodeAt(0) < 256) {
59+
token.type = LATIN
60+
token.target = ch
61+
return token
62+
} else {
63+
cmp = COLLATOR.compare(ch, FIRST_PINYIN_UNIHAN)
64+
if (cmp < 0) {
65+
token.type = UNKNOWN
66+
token.target = ch
67+
return token
68+
} else if (cmp === 0) {
69+
token.type = PINYIN
70+
offset = 0
71+
} else {
72+
cmp = COLLATOR.compare(ch, LAST_PINYIN_UNIHAN)
73+
if (cmp > 0) {
74+
token.type = UNKNOWN
75+
token.target = ch
76+
return token
77+
} else if (cmp === 0) {
78+
token.type = PINYIN
79+
offset = UNIHANS.length - 1
80+
}
81+
}
82+
}
83+
84+
token.type = PINYIN
85+
if (offset < 0) {
86+
let begin = 0
87+
let end = UNIHANS.length - 1
88+
while (begin <= end) {
89+
offset = ~~((begin + end) / 2)
90+
let unihan = UNIHANS[offset]
91+
cmp = COLLATOR.compare(ch, unihan)
92+
93+
// Catch it.
94+
if (cmp === 0) {
95+
break
96+
}
97+
// Search after offset.
98+
else if (cmp > 0) {
99+
begin = offset + 1
100+
}
101+
// Search before the offset.
102+
else {
103+
end = offset - 1
104+
}
105+
}
106+
}
107+
108+
if (cmp < 0) {
109+
offset--
110+
}
111+
112+
token.target = PINYINS[offset]
113+
if (!token.target) {
114+
token.type = UNKNOWN
115+
token.target = token.source
116+
}
117+
return token
118+
}
119+
120+
function parse (str) {
121+
if (typeof str !== 'string') {
122+
throw new Error('argument should be string.')
123+
}
124+
if (!isSupported()) {
125+
throw new Error('not support Intl or zh-CN language.')
126+
}
127+
return str.split('').map(v => genToken(v))
128+
}
129+
130+
module.exports = {
131+
isSupported,
132+
parse,
133+
patchDict,
134+
genToken, // inner usage
135+
convertToPinyin (str, separator, lowerCase) {
136+
return parse(str).map(v => {
137+
if (lowerCase && v.type === PINYIN) {
138+
return v.target.toLowerCase()
139+
}
140+
return v.target
141+
}).join(separator || '')
142+
}
143+
}

src/index.js

Lines changed: 6 additions & 137 deletions
Original file line numberDiff line numberDiff line change
@@ -1,142 +1,11 @@
11
'use strict'
22

3-
const DICT = require('./dict')
3+
const pinyin = require('./core')
4+
const patcher56L = require('./patchers/56l')
45

5-
const FIRST_PINYIN_UNIHAN = '\u963F'
6-
const LAST_PINYIN_UNIHAN = '\u9FFF'
7-
8-
const LATIN = 1
9-
const PINYIN = 2
10-
const UNKNOWN = 3
11-
12-
let supported = null
13-
let COLLATOR
14-
15-
function patchDict (patchers) {
16-
if (!patchers) return
17-
if (typeof patchers === 'function') {
18-
patchers = [patchers]
19-
}
20-
if (patchers.forEach) {
21-
patchers.forEach(p => {
22-
typeof p === 'function' && p(DICT)
23-
})
24-
}
25-
}
26-
27-
function isSupported (force) {
28-
if (!force && supported !== null) {
29-
return supported
30-
}
31-
if (typeof Intl === 'object' && Intl.Collator) {
32-
COLLATOR = new Intl.Collator(['zh-Hans-CN', 'zh-CN'])
33-
supported = Intl.Collator.supportedLocalesOf(['zh-CN']).length === 1
34-
} else {
35-
supported = false
36-
}
37-
return supported
38-
}
39-
40-
function genToken (ch) {
41-
// Access DICT here, give the chance to patch DICT.
42-
const UNIHANS = DICT.UNIHANS
43-
const PINYINS = DICT.PINYINS
44-
const EXCEPTIONS = DICT.EXCEPTIONS
45-
const token = {
46-
source: ch
47-
}
48-
49-
// First check EXCEPTIONS map, then search with UNIHANS table.
50-
if (ch in EXCEPTIONS) {
51-
token.type = PINYIN
52-
token.target = EXCEPTIONS[ch]
53-
return token
54-
}
55-
56-
let offset = -1
57-
let cmp
58-
if (ch.charCodeAt(0) < 256) {
59-
token.type = LATIN
60-
token.target = ch
61-
return token
62-
} else {
63-
cmp = COLLATOR.compare(ch, FIRST_PINYIN_UNIHAN)
64-
if (cmp < 0) {
65-
token.type = UNKNOWN
66-
token.target = ch
67-
return token
68-
} else if (cmp === 0) {
69-
token.type = PINYIN
70-
offset = 0
71-
} else {
72-
cmp = COLLATOR.compare(ch, LAST_PINYIN_UNIHAN)
73-
if (cmp > 0) {
74-
token.type = UNKNOWN
75-
token.target = ch
76-
return token
77-
} else if (cmp === 0) {
78-
token.type = PINYIN
79-
offset = UNIHANS.length - 1
80-
}
81-
}
82-
}
83-
84-
token.type = PINYIN
85-
if (offset < 0) {
86-
let begin = 0
87-
let end = UNIHANS.length - 1
88-
while (begin <= end) {
89-
offset = ~~((begin + end) / 2)
90-
let unihan = UNIHANS[offset]
91-
cmp = COLLATOR.compare(ch, unihan)
92-
93-
// Catch it.
94-
if (cmp === 0) {
95-
break
96-
}
97-
// Search after offset.
98-
else if (cmp > 0) {
99-
begin = offset + 1
100-
}
101-
// Search before the offset.
102-
else {
103-
end = offset - 1
104-
}
105-
}
106-
}
107-
108-
if (cmp < 0) {
109-
offset--
110-
}
111-
112-
token.target = PINYINS[offset]
113-
if (!token.target) {
114-
token.type = UNKNOWN
115-
token.target = token.source
116-
}
117-
return token
118-
}
119-
120-
function parse (str) {
121-
if (typeof str !== 'string') {
122-
throw new Error('argument should be string.')
123-
}
124-
if (!isSupported()) {
125-
throw new Error('not support Intl or zh-CN language.')
126-
}
127-
return str.split('').map(v => genToken(v))
6+
// Patch dict for icudt56l.dat related env, such as safari|node v4.
7+
if (pinyin.isSupported() && patcher56L.shouldPatch(pinyin.genToken)) {
8+
pinyin.patchDict(patcher56L)
1289
}
12910

130-
module.exports = {
131-
isSupported,
132-
parse,
133-
patchDict,
134-
convertToPinyin (str, separator, lowerCase) {
135-
return parse(str).map(v => {
136-
if (lowerCase && v.type === PINYIN) {
137-
return v.target.toLowerCase()
138-
}
139-
return v.target
140-
}).join(separator || '')
141-
}
142-
}
11+
module.exports = pinyin

src/patchers/safari.js renamed to src/patchers/56l.js

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
'use strict'
22

3-
module.exports = function patcher (DICT) {
3+
exports = module.exports = function patcher (DICT) {
44
// Update EXCEPTIONS dict.
55
DICT.EXCEPTIONS = {
66
'\u55f2': 'DIA', // DIE 嗲
@@ -37,3 +37,26 @@ module.exports = function patcher (DICT) {
3737
DICT.UNIHANS[252] = '\u5a1d' // POU: 剖 --> 娝
3838
DICT.UNIHANS[330] = '\u5078' // TOU: 偷 --> 偸
3939
}
40+
41+
exports.shouldPatch = function shouldPatch (toToken) {
42+
if (typeof toToken !== 'function') return false
43+
// Special unihans that get incorrect pinyins.
44+
if (
45+
toToken('\u4f15').target === 'FOU'
46+
&& toToken('\u4eda').target === 'XIA'
47+
&& toToken('\u8bcc').target === 'ZHONG'
48+
&& toToken('\u5a64').target === 'CHONG'
49+
&& toToken('\u8160').target === 'CONG'
50+
&& toToken('\u6538').target === 'YONG'
51+
&& toToken('\u4e6f').target === 'HOU'
52+
&& toToken('\u5215').target === 'LENG'
53+
&& toToken('\u4f5d').target === 'GONG'
54+
&& toToken('\u72bf').target === 'HUAI'
55+
&& toToken('\u5217').target === 'LIAO'
56+
&& toToken('\u5222').target === 'LIN'
57+
&& toToken('\u94b6').target === 'E'
58+
) {
59+
return true
60+
}
61+
return false
62+
}

test/index.spec.js

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,8 @@
11
const assert = require('assert')
22
const { polyphone, common } = require('./hanziDict')
3-
let PinYin
3+
const PinYin = require('../src')
44

55
describe('PinYin', () => {
6-
before(done => {
7-
// Dynamically load lib
8-
if (typeof window === 'object' && window.window === window) {
9-
PinYin = require('../src/browser')
10-
} else {
11-
PinYin = require('../src/index')
12-
}
13-
done()
14-
})
15-
166
describe('#isSupported()', () => {
177
it('should return true when supported', () => {
188
assert(PinYin.isSupported() === true)

0 commit comments

Comments
 (0)