diff --git a/README.md b/README.md index ff69fba8..fc6771c3 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,20 @@ ![alt text](others/img/tushuguancangshu.png) +![alt text](others/img/znjldkd.png) + +![alt text](others/img/kudsvqw.png) + +![alt text](others/img/cqlbtdmdfu.png) + +![alt text](others/img/djbwv.png) + +![alt text](others/img/刚交的朋友.png) + +![alt text](others/img/刚交的好朋友.png) + +![alt text](others/img/刚交的好朋友2.png) + 后续todo: 拆分细胞词库,加上长尾词,重新分词 diff --git a/cn_dicts/base.dict.yaml b/cn_dicts/base.dict.yaml index 980fae87..f37b36df 100644 --- a/cn_dicts/base.dict.yaml +++ b/cn_dicts/base.dict.yaml @@ -9820,6 +9820,7 @@ sort: by_weight 帮人 bang ren 811 邦瑞 bang rui 1 榜三 bang san 6 +棒杀 bang sha 40 谤讪 bang shan 4 榜上 bang shang 499 帮上 bang shang 227 diff --git a/cn_dicts/ext.dict.yaml b/cn_dicts/ext.dict.yaml index ba16e86d..aae387ba 100644 --- a/cn_dicts/ext.dict.yaml +++ b/cn_dicts/ext.dict.yaml @@ -90295,7 +90295,6 @@ sort: by_weight 老作品 lao zuo pin 93 勒贝尔 le bei er 14 乐播投屏 le bo tou ping 94 -了不成 le bu cheng 43 乐不可极 le bu ke ji 2 乐不可言 le bu ke yan 1 勒布朗队 le bu lang dui 2 @@ -90336,7 +90335,6 @@ sort: by_weight 乐高玩具 le gao wan ju 50 乐高星球大战 le gao xing qiu da zhan 12 乐高游戏 le gao you xi 2 -了个寂寞 le ge ji mo 298 乐购超市 le gou chao shi 3 乐观程度 le guan cheng du 8 乐观大方 le guan da fang 2 @@ -90391,17 +90389,13 @@ sort: by_weight 乐呵呵地 le he he de 107 乐呵呵地说 le he he de shuo 8 乐和乐和 le he le he 2 -了很多 le hen duo 9945 乐乎公寓 le hu gong yu 3 乐华七子 le hua qi zi 8 乐极悲生 le ji bei sheng 5 -了几分 le ji fen 229 乐极忘形 le ji wang xing 1 乐极则悲 le ji ze bei 2 乐嘉老师 le jia lao shi 2 乐见其成 le jian qi cheng 100 -了就好 le jiu hao 68 -了就是好 le jiu shi hao 8 乐居房产 le ju fang chan 1 勒卡雷 le ka lei 11 乐开了花 le kai le hua 122 @@ -90425,7 +90419,6 @@ sort: by_weight 勒芒冠军 le mang guan jun 17 勒梅尔 le mei er 2 勒梅特 le mei te 24 -了没有啊 le mei you a 3 乐佩公主 le pei gong zhu 3 乐平公主 le ping gong zhu 2 乐起来 le qi lai 7 @@ -90525,21 +90518,13 @@ sort: by_weight 乐享健康 le xiang jian kang 26 乐享其中 le xiang qi zhong 1 乐享生活 le 
xiang sheng huo 6 -了新的 le xin de 929 -了一遍 le yi bian 2349 乐意不乐意 le yi bu le yi 8 -了一场 le yi chang 580 乐意分享 le yi fen xiang 48 乐意奉陪 le yi feng pei 7 -了一会 le yi hui 544 -了一会儿 le yi hui er 539 乐意接受 le yi jie shou 76 乐意来 le yi lai 5 乐意吗 le yi ma 14 乐意配合 le yi pei he 4 -了一声 le yi sheng 789 -了一下 le yi xia 13631 -了一些 le yi xie 6893 乐意之至 le yi zhi zhi 6 乐游记 le you ji 2 乐于帮助 le yu bang zhu 46 @@ -90565,10 +90550,8 @@ sort: by_weight 乐园内 le yuan nei 4 乐园游玩 le yuan you wan 2 乐园中 le yuan zhong 18 -了这个 le zhe ge 3144 乐中乐 le zhong le 2 乐卓博大学 le zhuo bo da xue 1 -了自己 le zi ji 3584 乐子人 le zi ren 117 乐滋滋地 le zi zi de 7 雷安娜 lei an na 1 diff --git a/others/img/cqlbtdmdfu.png b/others/img/cqlbtdmdfu.png new file mode 100644 index 00000000..1dc68b7c Binary files /dev/null and b/others/img/cqlbtdmdfu.png differ diff --git a/others/img/djbwv.png b/others/img/djbwv.png new file mode 100644 index 00000000..12ec2fae Binary files /dev/null and b/others/img/djbwv.png differ diff --git a/others/img/gdyu.png b/others/img/gdyu.png new file mode 100644 index 00000000..3dbbf452 Binary files /dev/null and b/others/img/gdyu.png differ diff --git a/others/img/kudsvqw.png b/others/img/kudsvqw.png new file mode 100644 index 00000000..4b3d6193 Binary files /dev/null and b/others/img/kudsvqw.png differ diff --git a/others/img/znjldkd.png b/others/img/znjldkd.png new file mode 100644 index 00000000..5fcc15d4 Binary files /dev/null and b/others/img/znjldkd.png differ diff --git "a/others/img/\345\210\232\344\272\244\347\232\204\345\245\275\346\234\213\345\217\213.png" "b/others/img/\345\210\232\344\272\244\347\232\204\345\245\275\346\234\213\345\217\213.png" new file mode 100644 index 00000000..15df7a5b Binary files /dev/null and "b/others/img/\345\210\232\344\272\244\347\232\204\345\245\275\346\234\213\345\217\213.png" differ diff --git "a/others/img/\345\210\232\344\272\244\347\232\204\345\245\275\346\234\213\345\217\2132.png" 
# Script contributed by 雨辰 (RIME input method discussion group).
"""Convert an ARPA n-gram language model into per-order frequency lists.

For every n-gram order found in the ARPA file, one text file is written with
lines of the form ``<ngram>\\t<freq>``, where ``freq`` is the n-gram
probability multiplied by the number of n-grams of that order declared in the
ARPA header, rounded to the nearest integer.
"""
import math
import os
import re
import string

# Directory that holds the input ARPA file and receives the generated lists.
DEFAULT_DICT_DIR = 'cn_dicts_dazhu'

# Section header such as "\2-grams:"; the captured group is the n-gram order.
_SECTION_HEADER = re.compile(r"^\\(\d+)-grams:")

# Header count line such as "ngram 2=12345" (order, then number of n-grams).
_COUNT_LINE = re.compile(r"^ngram\s+(\d+)\s*=\s*(\d+)$")


def extract_ngram_counts(arpa_file):
    """Read the ``ngram N=COUNT`` lines from the ARPA header.

    Args:
        arpa_file: Path of a UTF-8 encoded ARPA file.

    Returns:
        A dict mapping each n-gram order (int) to the declared count (int).
        Reading stops at the first ``\\N-grams:`` section header, so the
        n-gram data itself is never scanned.
    """
    ngrams_counts = {}
    with open(arpa_file, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if _SECTION_HEADER.match(line):
                # The header block is over once the first data section starts.
                break
            count_match = _COUNT_LINE.match(line)
            if count_match:
                order, count = count_match.groups()
                ngrams_counts[int(order)] = int(count)
    return ngrams_counts


def extract_ngrams(arpa_file):
    """Yield every n-gram of an ARPA file together with its probability.

    Data rows are tab separated: ``log10(prob) <TAB> ngram [<TAB> backoff]``.
    The row is split on tabs instead of being matched with a decimal-only
    regex, so integer or exponent-style numbers (``0``, ``-99``, ``-1e-05``)
    are parsed and a trailing backoff weight never ends up in the n-gram text.

    Args:
        arpa_file: Path of a UTF-8 encoded ARPA file.

    Yields:
        ``(order, (ngram, prob))`` where ``order`` is the order of the
        enclosing ``\\N-grams:`` section, ``ngram`` is the space separated
        word sequence and ``prob`` is the linear probability.
    """
    with open(arpa_file, 'r', encoding='utf-8') as file:
        current_order = 0
        for line in file:
            line = line.strip()

            header = _SECTION_HEADER.match(line)
            if header:
                current_order = int(header.group(1))
                continue
            if current_order == 0:
                # Still inside the "\data\" header block: nothing to emit.
                continue

            fields = line.split('\t')
            if len(fields) < 2:
                # Blank lines and markers such as "\end\" carry no n-gram.
                continue
            try:
                logprob = float(fields[0])
            except ValueError:
                continue
            ngram = fields[1].strip()
            if not ngram:
                continue

            # ARPA stores base-10 logarithms, so invert with 10**x; math.exp
            # (base e) would overestimate every probability.
            yield current_order, (ngram, 10.0 ** logprob)


def write_frequencies_to_file(ngrams_generator, ngrams_counts, filename_pattern,
                              output_dir=DEFAULT_DICT_DIR):
    """Write one ``<ngram>\\t<freq>`` file per n-gram order.

    Args:
        ngrams_generator: Iterable of ``(order, (ngram, prob))`` items, as
            produced by :func:`extract_ngrams`. Items of the same order are
            expected to be contiguous; a new file is opened (and truncated)
            every time the order changes.
        ngrams_counts: Dict mapping order to the number of n-grams of that
            order; a missing order falls back to 1.
        filename_pattern: ``str.format`` pattern that receives the order,
            e.g. ``"ngram_{}_frequencies.txt"``.
        output_dir: Directory for the generated files. It is created when
            missing. Defaults to ``cn_dicts_dazhu``.
    """
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    current_order = None
    file = None
    try:
        for order, ngram_data in ngrams_generator:
            if file is None or order != current_order:
                if file:
                    file.close()
                current_order = order
                filename = os.path.join(output_dir, filename_pattern.format(order))
                file = open(filename, 'w', encoding='utf-8')
                print(f"Writing {current_order}-grams to {filename}")

            ngram, prob = ngram_data
            total_count = ngrams_counts.get(order, 1)
            freq = round(prob * total_count)
            file.write(f"{ngram}\t{freq}\n")
    finally:
        # Close the last file even when the generator or a write fails.
        if file:
            file.close()


def main():
    """Convert the default ARPA model into per-order frequency files."""
    # Update the path to your ARPA file
    arpa_file_path = os.path.join(DEFAULT_DICT_DIR, "zhi0709.arpa")

    # Extract n-grams counts
    ngrams_counts = extract_ngram_counts(arpa_file_path)

    # Extract n-grams and write frequencies to files
    ngrams_generator = extract_ngrams(arpa_file_path)
    write_frequencies_to_file(ngrams_generator, ngrams_counts, "ngram_{}_frequencies.txt")


if __name__ == "__main__":
    main()