-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathgen_csv.py
327 lines (294 loc) · 10.8 KB
/
gen_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# File : gen_csv.py
# Author: anyongjin
# Date : 2022/11/12
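'''
Script that reads device models from Markdown and converts them to CSV.
Output columns: device ID, device type, brand code, brand name, model code, model alias, model name, version name
'''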
import os
import re
import traceback
import pandas as pd
from typing import Optional, List
from os.path import dirname, abspath
from git import Repo # rely on gitpython
class PhoneModel:
    """Pull the local MobileModels checkout and remember its HEAD commit."""

    def __init__(self):
        """Pull ``origin`` for the repository at ``repo_path`` and record HEAD.

        The commit hash is stored on ``self.new_commit``, printed, and
        exported through the ``LATEST_COMMIT`` environment variable.
        """
        # Opens an existing checkout; cloning is not performed here.
        checkout = Repo(repo_path)
        checkout.remotes.origin.pull()
        head_sha = checkout.head.commit.hexsha
        self.new_commit = head_sha
        print("MobileModels latest commit: " + str(head_sha))
        os.environ["LATEST_COMMIT"] = head_sha

    def data_save(self):
        """Write the recorded commit hash to ``sync.log``, replacing its content."""
        with open('sync.log', 'wt') as log_file:
            log_file.write(self.new_commit)
# Location of the MobileModels checkout and of its per-brand Markdown files.
repo_path = './MobileModels/'
source_dir = os.path.join(repo_path, 'brands')

# Parser state shared by the helper functions while a brand file is read.
device_type: Optional[str] = None  # device type, e.g. phone, TV, band
root_brand: Optional[str] = None  # brand code
root_brand_title: Optional[str] = None  # brand name
devc_code: Optional[str] = None  # device model code
devc_code_alias: Optional[str] = None  # device model alias
devc_model_names: List[str] = []  # official device model names

_re_title = re.compile(r'^#+')
_re_blanks = re.compile(r'\s+')
_re_char = re.compile(r'([+]+|[^\W_])')
_re_word = re.compile(r'([a-zA-Z0-9]+|[^\W_]{,3})')
_re_non_word = re.compile(r'[\W_]+')
# Regexes matching the back-quoted model list and the version text after it.
_re_model_ver = re.compile(r'^`(([^`]+)`\s*)+:\s*')
_re_model_item = re.compile(r'`([^`]+)`')
# Regex matching a device-type keyword.
_re_device_type = re.compile(r'(手机|手表|手环|平板|电视主机|盒子|(智能)?电视|笔记本电脑|设备|穿戴|Mobile|Phone|Pad|Pod|Tablet|Watch|Band|WATCH|Device|\bTV\b|学习智慧屏|智慧屏)')
# Maps a matched device-type keyword to its short code. _read_device_type
# lower-cases the keyword before the lookup, so every key must be lower case;
# keywords without an entry are used as their own (lower-cased) code.
_device_map = {
    '手机': 'mob',
    'mobile': 'mob',
    'phone': 'mob',
    '电视': 'tv',
    '智能电视': 'tv',
    '学习智慧屏': 'pad',
    '智慧屏': 'tv',
    '设备': 'device',
    '手表': 'watch',
    '手环': 'band',
    'band': 'band',
    '笔记本电脑': 'computer',
    'tablet': 'pad',
    '平板': 'pad',
    '电视主机': 'tv_hub',
    '盒子': 'tv_hub',
}

# Column names of the generated CSV and the rows collected for it.
pd_cols = 'model,dtype,brand,brand_title,code,code_alias,model_name,ver_name'.split(',')
pd_rows = []
def _process_h1(line: str):
    """Set the device type and brand title from a level-1 heading.

    :param line: heading text with the leading ``#`` already removed
    :raises ValueError: if the heading contains no device-type keyword or no
        brand word in front of that keyword
    """
    global device_type, root_brand, root_brand_title
    assert root_brand, 'root_brand is required'
    # Drop qualifiers that carry no brand information.
    line = re.sub(r'(Global|早期|国行)', '', line)
    # The brand text is whatever precedes the device-type keyword.
    type_pos, device_type = _read_device_type(line)
    # \w{2,} only yields words of at least two characters.
    candidates = re.findall(r'\w{2,}', line[:type_pos])
    if not candidates:
        raise ValueError(f'no brand found in h1: {line}')
    if len(candidates) == 1:
        root_brand_title = candidates[0]
    else:
        # Prefer the first word that is not just the brand code itself;
        # fall back to the brand code when every word equals it.
        brand_key = root_brand.lower()
        root_brand_title = next(
            (word for word in candidates if word.lower() != brand_key),
            root_brand,
        )
def _read_device_type(line: str, raise_err: bool = True):
    """Locate the first device-type keyword in ``line``.

    :param line: text to search
    :param raise_err: raise when no keyword is found instead of returning
        the ``(-1, None)`` placeholder
    :return: ``(start index of the keyword, device-type code)``; the code is
        the lower-cased keyword translated through ``_device_map`` when an
        entry exists
    :raises ValueError: if nothing matches and ``raise_err`` is true
    """
    found = _re_device_type.search(line)
    if found is not None:
        keyword = found.group().lower()
        return found.start(), _device_map.get(keyword, keyword)
    if raise_err:
        raise ValueError(f'unknown h1 format: {line}')
    return -1, None
def _process_bold_model(line: str):
    """Handle a bold device-model line and refresh the model context.

    Resets the per-model context, then fills ``devc_code`` from a
    ``[`code`]`` marker, ``devc_code_alias`` from a ``(`alias`)`` marker and
    ``devc_model_names`` from the text between them. ``device_type`` is
    updated when the model text itself names a device type.

    :param line: content of the bold line without the surrounding ``**``
    :return: None
    """
    global device_type, devc_code, devc_code_alias, devc_model_names
    _reset_context('code')
    code_mat = re.search(r'\[\`([^`]+)\`\]', line)
    code_nmat = re.search(r'\(\`([^`]+)\`\)', line)
    # The model name sits after the code marker and before the alias marker.
    md_start, md_end = 0, len(line)
    if code_mat:
        devc_code = code_mat.group(1)
        md_start = code_mat.end()
    if code_nmat:
        devc_code_alias = code_nmat.group(1)
        md_end = code_nmat.start()
    model_name = _strip_text(line[md_start: md_end])
    # Check whether the device type changes with this model.
    dtype = _read_device_type(model_name, False)[1]
    if dtype and dtype != device_type:
        device_type = dtype
    # One line may list several names separated by '/'.
    model_names = [_strip_text(mname) for mname in _try_split_by_splash(model_name)]
    # Strip the brand code (and anything before it) from names that contain it.
    devc_model_names = []
    for mname in model_names:
        brand_start = mname.find(root_brand)
        if brand_start >= 0:
            mname = _strip_text(mname[brand_start + len(root_brand):])
        devc_model_names.append(mname)
def _get_ver_name_with_model(ver_full: str, model_name: str):
    """Remove the model-name part from a full version string.

    The version text may contain only part of the model name rather than all
    of it.

    :param ver_full: full version text
    :param model_name: model name expected at the start of the version text
    :return: ``ver_full`` unchanged when the first word of ``model_name`` does
        not occur in it; otherwise ``'#'`` followed by the cleaned text that
        starts at the first character not matching the model name
        (``'#'`` alone when nothing is left)
    """
    first_word = _re_word.search(model_name).group().lower()
    anchor = ver_full.lower().find(first_word)
    if anchor < 0:
        return ver_full
    model_tokens = [m.group().lower() for m in _re_char.finditer(model_name)]
    matched = 0
    for token in _re_char.finditer(ver_full):
        pos = token.start()
        if pos < anchor:
            # Still in front of the place where the model name begins.
            continue
        if matched < len(model_tokens) and token.group().lower() == model_tokens[matched]:
            matched += 1
            continue
        # Model name exhausted or diverged: the rest is the version name.
        return '#' + _strip_text(ver_full[pos:])
    return '#'
def _strip_text(text: str):
    """Trim invalid edge characters from ``text`` and balance its brackets.

    Characters before the first and after the last ``_re_char`` match are
    removed; an empty string is returned when nothing matches. Unmatched
    closing brackets get an opening bracket prepended, unmatched opening
    brackets get a closing bracket appended.
    """
    head = _re_char.search(text)
    if head is None:
        return ''
    # Drop invalid leading characters.
    text = text[head.start():]
    # Drop invalid trailing characters by searching the reversed text.
    tail = _re_char.search(text[::-1])
    clean_text = text[:len(text) - tail.start()]

    # NOTE(review): the two entries of each pair below look identical here;
    # presumably one of them was a full-width parenthesis - confirm against
    # the repository copy.
    openers = {'(', '('}
    closers = {')', ')'}
    brac_map = {'(': ')', '(': ')', ')': '(', ')': '('}

    # Complete the missing brackets.
    open_stack, prepend = [], []
    for ch in clean_text:
        if ch in openers:
            open_stack.append(ch)
        elif ch in closers:
            if open_stack:
                open_stack.pop()
            else:
                prepend.append(brac_map[ch])
    appends = [brac_map[opener] for opener in open_stack]
    return ''.join([*prepend, clean_text, *appends])
def _get_ver_name(ver_full: str):
    """Pick the shortest version name over all current model names.

    Each entry of ``devc_model_names`` is stripped from ``ver_full`` in turn
    and the shortest result wins (the earliest one on ties).

    :param ver_full: full version text
    :return: the winning version name, prefixed with the index of the model
        name it came from unless that index is 0
    :raises ValueError: if no model name is in context, or if every model
        name failed with ``ValueError`` (the last such error is re-raised)
    """
    candidates, last_err = [], None
    for idx, mname in enumerate(devc_model_names):
        try:
            candidates.append((idx, _get_ver_name_with_model(ver_full, mname)))
        except ValueError as err:
            last_err = err
    if not candidates:
        if last_err is None:
            # No model names at all: `raise None` would be a TypeError.
            raise ValueError(f'no model name in context for version: {ver_full}')
        raise last_err
    idx, ver_name = min(candidates, key=lambda item: len(item[1]))
    return ver_name if not idx else f'{idx}{ver_name}'
def _try_split_by_splash(type_name: str):
    """Split ``type_name`` on ``/`` when it lists several versions.

    Multiple versions usually share their leading words, so the split is
    only accepted when the first two parts begin with the same word;
    otherwise the original text is returned as a single-item list.
    """
    parts = [piece.strip() for piece in type_name.split('/')]
    if len(parts) < 2:
        return parts
    first_words = [_re_non_word.split(part)[0] for part in parts[:2]]
    if first_words[0] != first_words[1]:
        # Different first word: not treated as multiple versions.
        return [type_name]
    return parts
def _process_model_ver(line: str, mat: re.Match):
    """Append CSV rows for a line of back-quoted models and a version text.

    :param line: the whole line
    :param mat: match of ``_re_model_ver`` on ``line``; covers the model list
    """
    model_ids = [item.group(1) for item in _re_model_item.finditer(mat.group())]
    version_text = _strip_text(line[mat.end():])
    for full_name in _try_split_by_splash(version_text):
        ver_name = _get_ver_name(full_name)
        joined_names = '|'.join(devc_model_names)
        # One row per model id for this version.
        pd_rows.extend(
            (model, device_type, root_brand, root_brand_title, devc_code,
             devc_code_alias, joined_names, ver_name)
            for model in model_ids
        )
def _process_line(line: str):
    """Dispatch one non-empty, stripped Markdown line to its handler.

    Lines starting with ``-`` are ignored. Level-1 headings go to
    ``_process_h1``, level-2 headings may only update ``device_type``, bold
    lines go to ``_process_bold_model`` and model/version lines to
    ``_process_model_ver``.

    :raises ValueError: for headings deeper than level 2 and for lines that
        match none of the known formats
    """
    global device_type
    if line.startswith('-'):
        return
    heading = _re_title.search(line)
    level = len(heading.group(0)) if heading else 0
    body = line[level:].strip()

    if level == 1:
        _process_h1(body)
        return
    if level == 2:
        # Series, sub-brand or a different product type.
        dtype = _read_device_type(body, False)[1]
        if dtype:
            device_type = dtype
        return
    if level:
        raise ValueError(f'unknown title type: {level}, {line}')

    if body.startswith('**') and body.endswith('**'):
        _process_bold_model(body[2:-2])
        return
    detail_mat = _re_model_ver.search(body)
    if detail_mat is None:
        raise ValueError(f'unknown line: {line}')
    _process_model_ver(body, detail_mat)
def _reset_context(level: str):
    """Reset the shared parser context.

    :param level: ``'brand'`` clears the device type and brand title,
        ``'code'`` clears the model code, alias and model names, ``'all'``
        clears both groups; any other value changes nothing
    :return: None
    """
    global device_type, root_brand_title, devc_code, devc_code_alias, devc_model_names
    if level in ('brand', 'all'):
        device_type = root_brand_title = None
    if level in ('code', 'all'):
        devc_code = devc_code_alias = None
        devc_model_names = []
def sync_brands(name: str):
    """Parse one brand Markdown file from ``source_dir``.

    The brand code is the part of ``name`` before the first non-word
    character or underscore, with ``'shouji'`` removed. A failure on a line
    is printed together with its traceback and parsing continues with the
    next line.

    :param name: file name inside ``source_dir``
    """
    global root_brand
    _reset_context('all')
    root_brand = re.split(r'[\W_]+', name)[0].replace('shouji', '')
    with open(os.path.join(source_dir, name), 'r', encoding='utf-8') as fdata:
        for raw_line in fdata:
            text = raw_line.strip()
            if not text:
                continue
            try:
                _process_line(text)
            except Exception as e:
                # Best effort: report the bad line and keep going.
                print(f'exception process {root_brand}: {e}')
                traceback.print_exc()
if __name__ == '__main__':
    # Every file in the brands directory is parsed; the filter that skipped
    # `_en.md` files is disabled.
    fnames = sorted(os.listdir(source_dir))
    for name in fnames:
        print(f'process: {name}')
        sync_brands(name)

    df = pd.DataFrame(pd_rows, columns=pd_cols)
    df.to_csv('./models.csv', index=False, encoding='utf-8-sig')
    print('generate complete, out file: ./models.csv')

    # Commit recorded by the previous run; empty when there is no log yet.
    try:
        with open('sync.log', 'rt') as f:
            last_commit = f.readline()
    except FileNotFoundError:
        last_commit = ''

    # NOTE(review): PhoneModel() pulls the repository, which happens after
    # the CSV above was generated - confirm this ordering is intended.
    pm = PhoneModel()
    if pm.new_commit == last_commit:
        print("No update, skip.")
    else:
        pm.data_save()