-
Notifications
You must be signed in to change notification settings - Fork 69
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
使用745396750字的高质量语料,进行分词,重新统计字频、词频,归一化
* 修改注音 * 分词 * 计算方法修改 * 分词 * 1 * 分词 * 更新日志 * 多音字 * 多音字 * 生成字频词频 * 使用知乎频率重做词频 * 1 * 1 * 词频 * 1 * 1 * # todo 按音节、按频率排序 * 维基词频 * 维基词频 * 修词频 * 字表排序 * 删词 * 更新多音字 * 删词加词 * 加词 * 词频745396750字 * 词频745396750字 * 字频 * 更新readme * readme * 棒杀 * 删词 * 更新readme * 1 * 更新readme
- Loading branch information
Showing
12 changed files
with
83 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
# Script from the RIME input-method discussion group — author: Yuchen (雨辰)
import re | ||
import math | ||
import os | ||
import string | ||
|
||
def extract_ngram_counts(arpa_file):
    """Parse the ``\\data\\`` header of an ARPA language-model file.

    Returns a dict mapping n-gram order to the declared entry count,
    e.g. ``{1: 12345, 2: 67890}``. Scanning stops at the ``\\1-grams:``
    marker, so only the header section is ever read.
    """
    counts = {}
    with open(arpa_file, 'r', encoding='utf-8') as fh:
        for raw in fh:
            stripped = raw.strip()
            if stripped.startswith("\\1-grams:"):
                # Header is over; no need to scan the entry sections.
                break
            if not stripped.startswith("ngram"):
                continue
            # Header lines look like "ngram 1=12345".
            fields = stripped.split('=')
            if len(fields) == 2 and fields[0].startswith("ngram"):
                order = int(fields[0].split()[1])
                counts[order] = int(fields[1])
    return counts
|
||
def extract_ngrams(arpa_file):
    """Yield ``(order, (ngram, probability))`` for each entry of an ARPA file.

    Entry lines are "logprob <TAB> ngram [<TAB> backoff]"; the optional
    trailing backoff weight is ignored. Section headers such as
    "\\2-grams:" set the current n-gram order.

    Bug fix: ARPA files store *base-10* log probabilities, so each
    probability is recovered as ``10 ** logprob``. The previous
    ``math.exp()`` treated them as natural logs and understated every
    probability.
    """
    # One float logprob, the n-gram text, then an optional backoff weight.
    ngram_line_pattern = re.compile(r"^(-?\d+\.\d+)\t(.+?)(?:\t-?\d+\.\d+)?$")
    with open(arpa_file, 'r', encoding='utf-8') as file:
        current_order = 0
        for line in file:
            line = line.strip()
            if line.startswith("\\") and "-grams:" in line:
                # e.g. "\2-grams:" -> order 2.
                current_order = int(line.split('-')[0][1:])
                continue

            ngram_line_match = ngram_line_pattern.match(line)
            if ngram_line_match:
                logprob, ngram = ngram_line_match.groups()
                # ARPA logprobs are log10, not natural log.
                prob = 10.0 ** float(logprob)
                yield current_order, (ngram.strip(), prob)
|
||
def write_frequencies_to_file(ngrams_generator, ngrams_counts, filename_pattern):
    """Write one frequency file per n-gram order under ``cn_dicts_dazhu/``.

    ``ngrams_generator`` yields ``(order, (ngram, probability))`` pairs
    grouped by order (as produced by ``extract_ngrams``); ``ngrams_counts``
    maps each order to the total entry count from the ARPA header.
    Each output line is "ngram <TAB> freq" where
    ``freq = round(probability * total_count)``.

    ``filename_pattern`` must contain one ``{}`` placeholder for the
    order, e.g. "ngram_{}_frequencies.txt".

    Fixes: the open file handle is now released even if a write raises
    (previously leaked), and the progress message reports the actual
    output filename (previously a garbled placeholder).
    """
    current_order = 0
    file = None
    try:
        for order, ngram_data in ngrams_generator:
            if order != current_order:
                # New order section: close the previous file, open the next.
                if file:
                    file.close()
                current_order = order
                filename = os.path.join('cn_dicts_dazhu',
                                        filename_pattern.format(order))
                file = open(filename, 'w', encoding='utf-8')
                print(f"Writing {current_order}-grams to {filename}")

            ngram, prob = ngram_data
            # Convert the relative probability back into an integer count;
            # fall back to 1 if the header declared no count for this order.
            total_count = ngrams_counts.get(order, 1)
            freq = round(prob * total_count)
            file.write(f"{ngram}\t{freq}\n")
    finally:
        # Ensure the last (or current, on error) handle is released.
        if file:
            file.close()
|
||
# --- Script entry point ---------------------------------------------------
# Reads an ARPA language-model file and emits one tab-separated frequency
# file per n-gram order into cn_dicts_dazhu/.

# Update the path to your ARPA file before running.
arpa_file_path = os.path.join('cn_dicts_dazhu', "zhi0709.arpa")

# Declared entry counts per order, taken from the ARPA \data\ header.
ngrams_counts = extract_ngram_counts(arpa_file_path)

# Stream (order, (ngram, probability)) pairs from the model and write
# them out as integer frequencies, one file per order.
ngrams_generator = extract_ngrams(arpa_file_path)
write_frequencies_to_file(ngrams_generator, ngrams_counts, "ngram_{}_frequencies.txt")