-
Notifications
You must be signed in to change notification settings - Fork 3
/
hanzi_counter.py
143 lines (128 loc) · 4.9 KB
/
hanzi_counter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import pygal
import pypinyin
from pyhanlp import *
from typing import Union, Set, Dict, List, Any, Tuple, Optional
from svg import SVG
import os
import sys
import json
import argparse
import tempfile
from collections import defaultdict
import numpy as np
import re
from pprint import pprint
# Absolute path of the directory containing this script; the sample data
# file is looked up relative to it.
PWD = os.path.dirname(os.path.abspath(__file__))
def draw_svg(svg_path, title, counter):
    """Plot the counts in *counter* as a single pygal line series.

    Args:
        svg_path: Destination file for the rendered SVG.
        title: Label given to the plotted series.
        counter: Iterable of indexable entries; element ``[1]`` of each entry
            is used as the y-value, in iteration order.
    """
    chart = pygal.Line()
    values = []
    for entry in counter:
        values.append(entry[1])
    chart.add(title, values)
    chart.render_to_file(svg_path)
# Memo table: toneless pinyin syllable -> [sheng, yun].
# It is pre-seeded with syllables that have no initial consonant, which the
# suffix search in pinyin2shengyun() cannot split (it requires the leading
# part to be a known sheng).
# NOTE(review): pairs such as 'ang' -> ['a', 'h'] and 'eng' -> ['e', 'g'] look
# like a double-pinyin (shuangpin) key layout rather than a literal split --
# confirm with the author before "correcting" them.
pinyin2shengyun_cache = {
    'a': ['a', 'a'],
    'o': ['o', 'o'],
    'e': ['e', 'e'],
    'ai': ['a', 'i'],
    'an': ['a', 'n'],
    'ao': ['a', 'o'],
    'ei': ['e', 'i'],
    'en': ['e', 'n'],
    'er': ['e', 'r'],
    'ou': ['o', 'u'],
    'ang': ['a', 'h'],
    'eng': ['e', 'g'],
}

# Initial consonants accepted as a sheng (frozenset for O(1) membership).
_SHENGS = frozenset((
    'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r',
    's', 't', 'w', 'x', 'y', 'z', 'ch', 'sh', 'zh',
))

# Finals accepted as a yun, tried in this order.
_YUNS = (
    'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'i', 'ia', 'ian',
    'iang', 'iao', 'ie', 'iong', 'in', 'ing', 'iu', 'o', 'ong', 'ou', 'u',
    'ua', 'uai', 'uan', 'uang', 'ue', 'ui', 'un', 'uo', 'v', 've',
)


def pinyin2shengyun(pinyin):
    """Split a toneless pinyin syllable into ``[sheng, yun]``.

    Results are memoised in ``pinyin2shengyun_cache``; the list returned is
    the cached object itself, so callers should not mutate it.

    Args:
        pinyin: Lower-case pinyin syllable without tone marks or digits.

    Returns:
        A two-element list ``[sheng, yun]``.

    Raises:
        KeyError: If the syllable is not pre-seeded in the cache and cannot
            be split into a known sheng followed by a known yun.
    """
    cached = pinyin2shengyun_cache.get(pinyin)
    if cached is not None:
        return cached
    for yun in _YUNS:
        if not pinyin.endswith(yun):
            continue
        sheng = pinyin[:len(pinyin) - len(yun)]
        if sheng not in _SHENGS:
            continue
        # No yun starts with 'h', so 'c'/'ch', 's'/'sh' and 'z'/'zh' can never
        # both fit the same syllable: the first match is the only match and
        # scanning can stop here.
        # Trace each newly resolved syllable once (cache hits stay silent).
        print(f'{pinyin} -> {sheng},{yun}')
        pair = [sheng, yun]
        pinyin2shengyun_cache[pinyin] = pair
        return pair
    # Same exception type the former trailing cache lookup produced.
    raise KeyError(pinyin)
# Matches every character in the range U+0000-U+007F; substituting it away
# leaves only the non-ASCII part of a line (this also removes the newline).
_ASCII_RE = re.compile("[\u0000-\u007f]")


def _print_pinyin_examples(table_path):
    """Print one example hanzi for every distinct pinyin found in *table_path*.

    A usable line has the hanzi as its first character and, after the first
    space, one or more readings separated by ``/``. Digits are removed from
    each reading and it is lower-cased. The first hanzi that introduces a
    reading is kept as its example. Lines that are empty or have no second
    space-separated field are skipped.
    """
    pinyin2hanzi = {}
    # Explicit encoding: the file holds Chinese text and must not depend on
    # the platform's default locale encoding.
    with open(table_path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            fields = line.split(' ')
            if not line or len(fields) < 2:
                continue
            hanzi = line[0]
            for py in fields[1].split('/'):
                py = re.sub('[0-9]', '', py).lower()
                pinyin2hanzi.setdefault(py, hanzi)
    for py, hz in pinyin2hanzi.items():
        print(f'{hz} {py}')


def _strip_ascii_dump(path):
    """Write an ASCII-free copy of *path* to ``{path}_strip.txt``.

    Returns the original (unstripped) lines so later stages can reuse them.
    """
    with open(path, encoding='utf-8') as f:
        lines = f.readlines()
    with open(f'{path}_strip.txt', 'w', encoding='utf-8') as f:
        for line in lines:
            f.write(_ASCII_RE.sub("", line))
            # The substitution also removed the line's own newline.
            f.write('\n')
    return lines


def _ranked(counter):
    """Return the ``(key, count)`` pairs of *counter*, highest count first."""
    return sorted(counter.items(), key=lambda kv: kv[1], reverse=True)


def _write_ranking(f, ranking):
    """Write *ranking* to the open file *f* as ``index<TAB>key<TAB>count`` lines."""
    for index, (key, count) in enumerate(ranking):
        f.write(f'{index}\t{key}\t{count}\n')


def _count_and_plot(lines):
    """Count characters, sheng/yun parts and HanLP words in *lines*.

    Rankings are written to ``/tmp/{char,shengyun,word}.txt`` and the top 100
    entries of each are charted to the matching ``.svg`` file.
    """
    char_counter = defaultdict(int)
    word_counter = defaultdict(int)
    shengyun_counter = defaultdict(int)
    sheng2yun_counter = {}
    for line in lines:
        line = _ASCII_RE.sub("", line)
        for c in line:
            char_counter[c] += 1
            try:
                pinyin = pypinyin.pinyin(c, style=pypinyin.Style.NORMAL, errors='ignore')[0][0]
                sheng, yun = pinyin2shengyun(pinyin)
            except (IndexError, KeyError):
                # IndexError: no reading was returned for this character.
                # KeyError: the reading could not be split into sheng/yun.
                # The character itself has already been counted above.
                continue
            shengyun_counter[sheng] += 1
            shengyun_counter[yun] += 1
            if sheng not in sheng2yun_counter:
                sheng2yun_counter[sheng] = defaultdict(int)
            sheng2yun_counter[sheng][yun] += 1
        for seg in HanLP.segment(line):
            # Keep the text before the last '/' of the segment's string form.
            seg = str(seg)
            word_counter[seg[:seg.rfind('/')]] += 1

    char_ranking = _ranked(char_counter)
    with open('/tmp/char.txt', 'w', encoding='utf-8') as f:
        _write_ranking(f, char_ranking)
    draw_svg('/tmp/char.svg', '字', char_ranking[:100])

    shengyun_ranking = _ranked(shengyun_counter)
    with open('/tmp/shengyun.txt', 'w', encoding='utf-8') as f:
        _write_ranking(f, shengyun_ranking)
        f.write('### sheng -> yun -> count\n')
        f.write(json.dumps(sheng2yun_counter, indent=4))
    # NOTE(review): this chart reuses the character chart's title '字' --
    # possibly a copy-paste leftover; confirm the intended label.
    draw_svg('/tmp/shengyun.svg', '字', shengyun_ranking[:100])

    word_ranking = _ranked(word_counter)
    with open('/tmp/word.txt', 'w', encoding='utf-8') as f:
        _write_ranking(f, word_ranking)
    draw_svg('/tmp/word.svg', '词', word_ranking[:100])


if __name__ == '__main__':
    # Stage 1: list one example hanzi per pinyin from the bundled sample.
    _print_pinyin_examples(f'{PWD}/data/sample4.txt')
    sys.exit(0)

    # NOTE: everything below is unreachable while the sys.exit(0) above is in
    # place. The stages are kept so they can be re-enabled by moving or
    # removing the early exits.

    # Stage 2: write an ASCII-stripped copy of a dump file.
    path = '/home/tzx/git/feeder/dump/zhihu_2020-02-02_13-39-01.149.txt'
    # Alternative input: '/home/tzx/git/feeder/dump/zhihu_dump_1580381555.txt'
    lines = _strip_ascii_dump(path)
    sys.exit(0)

    # Stage 3: character / sheng-yun / word statistics and charts.
    _count_and_plot(lines)