Skip to content

Commit

Permalink
词频统计
Browse files Browse the repository at this point in the history
  • Loading branch information
gaboolic committed Jul 1, 2024
1 parent 3e322a6 commit 5d36140
Show file tree
Hide file tree
Showing 3 changed files with 1,140,834 additions and 120,050 deletions.
34 changes: 34 additions & 0 deletions others/program/mnbvc/merge_fenci_freq_result.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import os
import re
import math
import re

import jieba

# Matches a single character in the range U+4E00..U+9FA5 (common CJK
# ideographs). Compiled once at import time rather than on every call.
_CHINESE_CHAR_RE = re.compile(r"[\u4e00-\u9fa5]")


def match_chinese(text):
    """Return every Chinese character found in *text*.

    Args:
        text: The string to scan.

    Returns:
        A list of one-character strings, one for each character of *text*
        that lies in U+4E00..U+9FA5, in order of appearance. The list is
        empty (and therefore falsy) when *text* contains no such character,
        so the result can be used directly as a "contains Chinese" check.
    """
    return _CHINESE_CHAR_RE.findall(text)

# Merge the per-shard frequency files into one word -> total count mapping.
# NOTE(review): shards 0..4 are read here, while yuliao_fenci.py writes
# shards 1..4 only; confirm zhihu_deal_sort0.txt exists, otherwise open()
# raises FileNotFoundError.
word_map = {}
for i in range(0, 5):
    shard_path = f"cn_dicts_dazhu/zhihu_deal_sort{i}.txt"
    # The context manager closes each shard as soon as it has been read.
    with open(shard_path, 'r', encoding='utf-8') as read_file:
        for line in read_file:
            line = line.strip()
            if not line:
                # A blank line holds no "word<TAB>count" record; skipping it
                # avoids an IndexError on params[1].
                continue
            params = line.split("\t")
            word = params[0]
            freq = int(params[1])
            word_map[word] = word_map.get(word, 0) + freq

# Order the merged entries by total frequency, highest first.
sorted_word_map = sorted(word_map.items(), key=lambda x: x[1], reverse=True)

# Write the merged list, keeping only words that contain at least one
# Chinese character. The context manager flushes and closes the output.
with open("cn_dicts_dazhu/zhihu_deal_sort_merge.txt", 'w', encoding='utf-8') as write_file:
    for item in sorted_word_map:
        if match_chinese(item[0]):
            write_file.write(f"{item[0]}\t{item[1]}\n")
58 changes: 34 additions & 24 deletions others/program/mnbvc/yuliao_fenci.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
import os
import re
import math
import re

import jieba

# Matches a single character in the range U+4E00..U+9FA5 (common CJK
# ideographs). Compiled once at import time rather than on every call.
_CHINESE_CHAR_RE = re.compile(r"[\u4e00-\u9fa5]")


def match_chinese(text):
    """Return every Chinese character found in *text*.

    Args:
        text: The string to scan.

    Returns:
        A list of one-character strings, one for each character of *text*
        that lies in U+4E00..U+9FA5, in order of appearance. The list is
        empty (and therefore falsy) when *text* contains no such character,
        so the result can be used directly as a "contains Chinese" check.
    """
    return _CHINESE_CHAR_RE.findall(text)

# Demo: segment a sample word with jieba in precise mode (cut_all=False)
# and print the resulting tokens joined by "/ ".
seg_list = jieba.cut("耙耙柑", cut_all=False)
print("精确模式: " + "/ ".join(seg_list))
Expand All @@ -16,27 +22,31 @@
# Demo: segment the same sample word with jieba's search-engine mode and
# print the resulting tokens joined by "/ ".
seg_list = jieba.cut_for_search("耙耙柑")
print("搜索引擎模式: " + "/ ".join(seg_list))

# Count how often each jieba token occurs across the whole corpus file.
word_map = {}
deal_count = 0
# The context manager guarantees the corpus file is closed after reading.
with open("cn_dicts_dazhu/zhihu_deal.txt", 'r', encoding='utf-8') as read_file:
    for line in read_file:
        line = line.strip()
        seg_list = jieba.cut(line, cut_all=False)
        for seg in seg_list:
            word_map[seg] = word_map.get(seg, 0) + 1
        deal_count += 1
        # Progress report after every 1000 processed lines.
        if deal_count % 1000 == 0:
            print(f"当前处理数量{deal_count}")

print("词频统计完成")
print(len(word_map))
# Order the tokens by frequency, highest first.
sorted_word_map = sorted(word_map.items(), key=lambda x: x[1], reverse=True)

# Write one "token<TAB>count" record per line; the context manager flushes
# and closes the output file.
with open("cn_dicts_dazhu/zhihu_deal_sort.txt", 'w', encoding='utf-8') as write_file:
    for item in sorted_word_map:
        write_file.write(f"{item[0]}\t{item[1]}\n")
# Process corpus shards 1..4 independently: each shard gets its own token
# frequency table and its own sorted output file.
for i in range(1, 5):
    print(i)
    word_map = {}
    deal_count = 0
    # The context manager closes the shard before the next one is opened.
    with open(f"cn_dicts_dazhu/zhihu_deal{i}.txt", 'r', encoding='utf-8') as read_file:
        for line in read_file:
            line = line.strip()
            seg_list = jieba.cut(line, cut_all=False)
            for seg in seg_list:
                word_map[seg] = word_map.get(seg, 0) + 1
            deal_count += 1
            # Progress report after every 1000 processed lines.
            if deal_count % 1000 == 0:
                print(f"当前处理数量{deal_count}")

    print("词频统计完成")
    print(len(word_map))
    # Order this shard's tokens by frequency, highest first.
    sorted_word_map = sorted(word_map.items(), key=lambda x: x[1], reverse=True)

    # Write "token<TAB>count" records, keeping only tokens that contain at
    # least one Chinese character; the context manager flushes and closes
    # the output file.
    with open(f"cn_dicts_dazhu/zhihu_deal_sort{i}.txt", 'w', encoding='utf-8') as write_file:
        for item in sorted_word_map:
            if match_chinese(item[0]):
                write_file.write(f"{item[0]}\t{item[1]}\n")
Loading

0 comments on commit 5d36140

Please sign in to comment.