-
Notifications
You must be signed in to change notification settings - Fork 69
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
使用745396750字的高质量语料,进行分词,重新统计字频、词频,归一化
* 修改注音 * 分词 * 计算方法修改 * 分词 * 1 * 分词 * 更新日志 * 多音字 * 多音字 * 生成字频词频 * 使用知乎频率重做词频 * 1 * 1 * 词频 * 1 * 1 * # todo 按音节、按频率排序 * 维基词频 * 维基词频 * 修词频 * 字表排序 * 删词 * 更新多音字 * 删词加词 * 加词 * 词频745396750字 * 词频745396750字 * 字频 * 更新readme * readme * 棒杀 * 删词 * 更新readme * 1 * 更新readme
- Loading branch information
Showing
12 changed files
with
83 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
# Script from the RIME input-method discussion group — author: Yuchen (雨辰)
import re | ||
import math | ||
import os | ||
import string | ||
|
||
def extract_ngram_counts(arpa_file):
    """Parse the ``\\data\\`` header of an ARPA language-model file.

    Returns a dict mapping n-gram order to the declared entry count,
    e.g. ``{1: 12345, 2: 67890}``. Scanning stops at the ``\\1-grams:``
    marker, so only the header section is ever read.
    """
    counts = {}
    with open(arpa_file, 'r', encoding='utf-8') as fh:
        for raw in fh:
            stripped = raw.strip()
            if stripped.startswith("\\1-grams:"):
                # Header is over; no need to scan the entry sections.
                break
            if not stripped.startswith("ngram"):
                continue
            # Header lines look like "ngram 1=12345".
            fields = stripped.split('=')
            if len(fields) == 2 and fields[0].startswith("ngram"):
                order = int(fields[0].split()[1])
                counts[order] = int(fields[1])
    return counts
|
||
def extract_ngrams(arpa_file):
    """Yield ``(order, (ngram, probability))`` for each entry of an ARPA file.

    Entry lines are "logprob <TAB> ngram [<TAB> backoff]"; the optional
    trailing backoff weight is ignored. Section headers such as
    "\\2-grams:" set the current n-gram order.

    Bug fix: ARPA files store *base-10* log probabilities, so each
    probability is recovered as ``10 ** logprob``. The previous
    ``math.exp()`` treated them as natural logs and understated every
    probability.
    """
    # One float logprob, the n-gram text, then an optional backoff weight.
    ngram_line_pattern = re.compile(r"^(-?\d+\.\d+)\t(.+?)(?:\t-?\d+\.\d+)?$")
    with open(arpa_file, 'r', encoding='utf-8') as file:
        current_order = 0
        for line in file:
            line = line.strip()
            if line.startswith("\\") and "-grams:" in line:
                # e.g. "\2-grams:" -> order 2.
                current_order = int(line.split('-')[0][1:])
                continue

            ngram_line_match = ngram_line_pattern.match(line)
            if ngram_line_match:
                logprob, ngram = ngram_line_match.groups()
                # ARPA logprobs are log10, not natural log.
                prob = 10.0 ** float(logprob)
                yield current_order, (ngram.strip(), prob)
|
||
def write_frequencies_to_file(ngrams_generator, ngrams_counts, filename_pattern):
    """Write one frequency file per n-gram order under ``cn_dicts_dazhu/``.

    ``ngrams_generator`` yields ``(order, (ngram, probability))`` pairs
    grouped by order (as produced by ``extract_ngrams``); ``ngrams_counts``
    maps each order to the total entry count from the ARPA header.
    Each output line is "ngram <TAB> freq" where
    ``freq = round(probability * total_count)``.

    ``filename_pattern`` must contain one ``{}`` placeholder for the
    order, e.g. "ngram_{}_frequencies.txt".

    Fixes: the open file handle is now released even if a write raises
    (previously leaked), and the progress message reports the actual
    output filename (previously a garbled placeholder).
    """
    current_order = 0
    file = None
    try:
        for order, ngram_data in ngrams_generator:
            if order != current_order:
                # New order section: close the previous file, open the next.
                if file:
                    file.close()
                current_order = order
                filename = os.path.join('cn_dicts_dazhu',
                                        filename_pattern.format(order))
                file = open(filename, 'w', encoding='utf-8')
                print(f"Writing {current_order}-grams to {filename}")

            ngram, prob = ngram_data
            # Convert the relative probability back into an integer count;
            # fall back to 1 if the header declared no count for this order.
            total_count = ngrams_counts.get(order, 1)
            freq = round(prob * total_count)
            file.write(f"{ngram}\t{freq}\n")
    finally:
        # Ensure the last (or current, on error) handle is released.
        if file:
            file.close()
|
||
# --- Script entry point ---------------------------------------------------
# Reads an ARPA language-model file and emits one tab-separated frequency
# file per n-gram order into cn_dicts_dazhu/.

# Update the path to your ARPA file before running.
arpa_file_path = os.path.join('cn_dicts_dazhu', "zhi0709.arpa")

# Declared entry counts per order, taken from the ARPA \data\ header.
ngrams_counts = extract_ngram_counts(arpa_file_path)

# Stream (order, (ngram, probability)) pairs from the model and write
# them out as integer frequencies, one file per order.
ngrams_generator = extract_ngrams(arpa_file_path)
write_frequencies_to_file(ngrams_generator, ngrams_counts, "ngram_{}_frequencies.txt")