-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfontname.py
161 lines (135 loc) · 6.43 KB
/
fontname.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
"""Read and decode quirk encoded name records from OpenType fonts."""
import os
from enum import Enum
from fontTools import ttLib
__version__ = '1.0.0'
class IssueLevel(Enum):
    """Severity of the problem found while decoding a name record."""

    # Everything is ok.
    NONE = 0
    # The encoding mark does not match the string data.
    MARK = 1
    # The string data itself has problems: it cannot be decoded with any
    # OpenType supported encoding.
    DATA = 2
def decode_name(name_record):
    """Decode a fontTools NameRecord, working around known encoding quirks.

    The record's raw bytes are run through an ordered cascade of heuristics,
    each targeting a specific way fonts in the wild mislabel or corrupt their
    name strings. The order matters: earlier checks catch cases that later,
    more permissive decoders would silently decode to garbage.

    Args:
        name_record: An object providing ``string`` (the raw bytes),
            ``getEncoding()`` (the codec name the record is marked with, or
            None) and ``toUnicode()``.

    Returns:
        A tuple ``(decoded, issue_level, actual_encoding)``. ``issue_level``
        is an ``IssueLevel`` member; ``actual_encoding`` is the codec that
        really decoded the data, or None when ``issue_level`` is
        ``IssueLevel.DATA``.

    Raises:
        UnicodeDecodeError: If even the final ``utf_16_be`` fallback fails
            somewhere other than the last two bytes of the data.
    """
    raw = name_record.string
    encoding = name_record.getEncoding()

    def decode(encoding_):
        """Strictly decode ``raw`` with ``encoding_``, tolerating a cut-off tail.

        Some names are truncated and the data is permanently lost; only the
        remaining part can be recovered (eg: 文鼎勘亭流). A failure within the
        last two bytes is treated as such a truncation; any other failure is
        re-raised to the caller.
        """
        try:
            decoded = raw.decode(encoding_)
        except UnicodeDecodeError as ex:
            if ex.start >= len(raw) - 2:
                return raw.decode(encoding_, 'ignore'), IssueLevel.DATA, None
            raise
        level = IssueLevel.NONE if encoding_ == encoding else IssueLevel.MARK
        return decoded, level, encoding_

    # No such case is known, but things would break below if it happened.
    if encoding is None:
        return decode('utf_16_be')

    # Empty is empty.
    if len(raw) == 0:
        return decode(encoding)

    # Among all OpenType supported encodings, only <utf_16_be> may contain
    # '\x00' inside the data. If an inner '\x00' is found, the font might..
    if encoding != 'utf_16_be' and b'\x00' in raw.rstrip(b'\x00'):
        if all(b == 0 for b in raw[0::2]):
            # ..prepend a redundant 0x00 before every encoded byte
            # (eg: 微软简中圆)
            try:
                return raw[1::2].decode(encoding), IssueLevel.DATA, None
            except UnicodeDecodeError:
                pass
        else:
            # ..or mistakenly mark <utf_16_be> as another encoding
            # (eg: HG半古印体)
            try:
                return decode('utf_16_be')
            except UnicodeDecodeError:
                pass

    # Mistakenly mark <shift_jis>/<big5> as <mac_roman>.
    # <mac_roman> won't fail on any input, so this must be caught first.
    if encoding == 'mac_roman' and sum(b > 0x7f for b in raw) > 3:
        # Try big5 first as it is more likely to fail.
        for candidate in (
            'x_mac_trad_chinese_ttx',  # (eg: 華康布丁體(P))
            'x_mac_japanese_ttx',      # (eg: EPSON 丸ゴシック体M)
        ):
            try:
                return decode(candidate)
            except UnicodeDecodeError:
                pass

    # The infamous "恅隋xxxx" series (eg: 文鼎粗圆简 = 恅隋棉埴潠翷).
    # These fonts decoded some "original" strings with an incorrect encoding
    # and re-encoded the result to <utf_16_be>; in some cases there is a
    # redundant character at the end.
    if encoding == 'utf_16_be' and raw.startswith(b'\x60\x45\x96\x8b'):
        try:
            decoded = raw.decode('utf_16_be').encode('big5').decode('gb2312', 'replace')
            # '\ufffd' is the replacement character emitted by 'replace'.
            if decoded[-2] == '\ufffd':
                decoded = decoded[:-2]
            return decoded, IssueLevel.DATA, None
        except UnicodeError:
            pass

    # Mistakenly mark some other encoding as <utf_16_be>.
    # It is hard to tell an encoding from <utf_16_be> accurately, so vendor
    # prefixes have to be matched individually.
    if encoding == 'utf_16_be':
        for prefix, candidate in (
            (b'\xbb\xaa\xbf\xb5', 'gb2312'),  # (eg: 华康楷体W5-A)
            (b'\xb5\xd8\xb1\x64', 'big5'),    # (eg: 華康中黑體(P)-UN)
            (b'HanDing', 'ascii'),            # (eg: 汉鼎简中楷)
        ):
            if raw.startswith(prefix):
                try:
                    return decode(candidate)
                except UnicodeDecodeError:
                    pass

    # Mistakenly mark <x_mac_simp_chinese_ttx> as <x_mac_japanese_ttx>
    # (eg: 森泽UD新黑 Gb4 DB). These two encodings are also hard to
    # distinguish, so a known prefix is matched.
    if encoding == 'x_mac_japanese_ttx' and raw.startswith(b'\xc9\xad\xd4\xf3'):
        try:
            return decode('x_mac_simp_chinese_ttx')
        except UnicodeDecodeError:
            pass

    # Mistakenly mark <utf_16_be> as some other encoding (eg: 麗流隷書).
    # Usually decoding with the marked encoding fails, but in rare cases it
    # won't. Non-unicode encodings only support a small set of characters and
    # use very different code points from unicode, so if the actual encoding
    # is not <utf_16_be>, decoding with it is unlikely to produce content that
    # the marked encoding can encode.
    # Truncation detection is skipped here to minimise false positives.
    if encoding != 'utf_16_be' and len(raw) % 2 == 0:
        try:
            decoded = raw.decode('utf_16_be')
            decoded.encode(encoding)
            return decoded, IssueLevel.MARK, 'utf_16_be'
        except UnicodeError:
            pass

    # Try fontTools' own decoder.
    try:
        decoded = name_record.toUnicode()
        if decoded == raw.decode(encoding, 'replace'):
            return decoded, IssueLevel.NONE, encoding
        return decoded, IssueLevel.DATA, None
    except UnicodeDecodeError:
        pass

    # Marked as 'x_mac_simp_chinese_ttx', but actually using an extended
    # edition (eg: 方正字迹-黄陵野鶴行書 繁U). Note that this is in fact a data
    # issue: <gbk> is not an OpenType supported encoding.
    if encoding == 'x_mac_simp_chinese_ttx':
        try:
            return raw.decode('gbk'), IssueLevel.DATA, None
        except UnicodeDecodeError:
            pass

    # Try to just recover from truncated data (eg: 文鼎勘亭流).
    try:
        return decode(encoding)
    except UnicodeDecodeError:
        pass

    # The last hope (eg: 蘇新詩卵石體簡).
    return decode('utf_16_be')
# Language IDs to try, in priority order, when picking a display name.
# Each language contributes a pair of IDs (these appear to be the Windows
# language ID followed by the Macintosh one -- verify against the OpenType
# `name` table specification).
preferred_langs = [
    lang_id
    for id_pair in (
        (2052, 33),  # zh-Hans (Simplified Chinese)
        (1028, 19),  # zh-Hant (Traditional Chinese)
        (1041, 11),  # ja (Japanese)
        (1042, 23),  # ko (Korean)
        (1033, 0),   # en-US
    )
    for lang_id in id_pair
]
def get_display_name(font, preferred_langs=preferred_langs):
    """Return the decoded full name (nameID 4) of ``font``.

    Args:
        font: A font object whose ``font['name'].names`` is an iterable of
            name records with ``langID`` and ``nameID`` attributes.
        preferred_langs: Language IDs to try, in priority order.

    Returns:
        The decoded name with leading/trailing NUL characters stripped. The
        record of the first preferred language present is used; when none of
        the preferred languages is present, the first available full-name
        record is used instead of failing.

    Raises:
        ValueError: If the font has no nameID 4 record at all.
    """
    # When several records share a langID, the last one wins.
    names = {n.langID: n for n in font['name'].names if n.nameID == 4}
    for lang in preferred_langs:
        if lang in names:
            record = names[lang]
            break
    else:
        # Previously this situation leaked a bare StopIteration from next().
        if not names:
            raise ValueError("font has no full name (nameID 4) record")
        record = next(iter(names.values()))
    return decode_name(record)[0].strip('\x00')
def get_display_names(font_path, join=" & ", preferred_langs=preferred_langs):
    """Return the display name(s) of the font file at ``font_path``.

    Files with a ``.ttc`` or ``.otc`` extension (case-insensitive) are opened
    as a collection and yield one name per contained font; any other file is
    opened as a single font.

    Args:
        font_path: Path of the font file to open.
        join: Separator used to join the names into one string, or None to
            get the list of names back.
        preferred_langs: Language IDs to try, in priority order, passed on to
            ``get_display_name``.

    Returns:
        The names joined with ``join``, or the list of names when ``join`` is
        None.
    """
    ext = os.path.splitext(font_path)[1].lower()
    is_collection = ext in (".ttc", ".otc")
    if is_collection:
        tt = ttLib.TTCollection(font_path)
    else:
        tt = ttLib.TTFont(font_path)
    try:
        fonts = tt.fonts if is_collection else [tt]
        names = [get_display_name(font, preferred_langs) for font in fonts]
    finally:
        # Close the file even when a name cannot be extracted.
        tt.close()
    if join is None:
        return names
    return join.join(names)