From 841aa51dffef8eba55a9ed406554f13e07493c64 Mon Sep 17 00:00:00 2001 From: gaboolic <23441099@qq.com> Date: Mon, 1 Jul 2024 18:03:22 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A4=84=E7=90=86=E8=84=9A=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- others/program/deal_ice_dict_to_dazhu.py | 13 ++++++ others/program/mnbvc/get_dict.py | 54 ++++++++++++------------ others/program/mnbvc/yuliao_fenci.py | 2 +- 3 files changed, 42 insertions(+), 27 deletions(-) create mode 100644 others/program/deal_ice_dict_to_dazhu.py diff --git a/others/program/deal_ice_dict_to_dazhu.py b/others/program/deal_ice_dict_to_dazhu.py new file mode 100644 index 00000000..f868dce0 --- /dev/null +++ b/others/program/deal_ice_dict_to_dazhu.py @@ -0,0 +1,13 @@ +import os +import re +import string +import json + +read_file = open(os.path.join('cn_dicts_dazhu', "zhihu_deal_sort_merge.txt"), 'r') +write_file = open(os.path.join('cn_dicts_dazhu', "知频.txt"), 'w') +# 逐行读取文件内容 +for line in read_file: + line = line.strip() + params = line.split("\t") + + write_file.write(f"{params[1]}\t{params[0]}\n") diff --git a/others/program/mnbvc/get_dict.py b/others/program/mnbvc/get_dict.py index 5d1e791f..6f8f11c6 100644 --- a/others/program/mnbvc/get_dict.py +++ b/others/program/mnbvc/get_dict.py @@ -8,31 +8,33 @@ # file_name = os.path.join(os.path.expanduser("~/Downloads"),'liwu_253874_com.jsonl') # 里屋论坛 # file_name = os.path.join(os.path.expanduser("~/Downloads"),'46.jsonl') #维基 # file_name = os.path.join(os.path.expanduser("~/Downloads"),'oscar_202201.part_0075.jsonl') # 通用文本 -file_name = os.path.join(os.path.expanduser("~/mnbvc"),'5.jsonl') # 知乎 https://huggingface.co/datasets/liwu/MNBVC/tree/main/qa/20230196/zhihu -write_file_name = os.path.join('cn_dicts_dazhu', "zhihu_deal5.txt") -write_file = open(write_file_name, 'w') -line_count = 0 -with open(file_name, 'r') as file: - # 逐行读取文件内容 - for line in file: - line_count += 1 - line = file.readline() - line = line.strip() - if len(line) == 0: - continue - try: - data = json.loads(line) - except json.decoder.JSONDecodeError: - print("JSONDecodeError") - print(line) - continue - - q_content = data['问'] - a_content = data['答'] - write_file.write(q_content) - write_file.write("\n") - write_file.write(a_content) - write_file.write("\n") - print(f"line_count {line_count}") + +for i in range(0,5): + file_name = os.path.join(os.path.expanduser("~/mnbvc"),f'{i}.jsonl') # 知乎 https://huggingface.co/datasets/liwu/MNBVC/tree/main/qa/20230196/zhihu + write_file_name = os.path.join('cn_dicts_dazhu', "zhihu_deal{i}.txt") + write_file = open(write_file_name, 'w') + line_count = 0 + with open(file_name, 'r') as file: + # 逐行读取文件内容 + for line in file: + line_count += 1 + line = line.strip() + if len(line) == 0: + continue + + try: + data = json.loads(line) + except json.decoder.JSONDecodeError: + print("JSONDecodeError") + print(line) + continue + + q_content = data['问'] + a_content = data['答'] + write_file.write(q_content) + write_file.write("\n") + write_file.write(a_content) + write_file.write("\n") + print(f"line_count {line_count}") diff --git a/others/program/mnbvc/yuliao_fenci.py b/others/program/mnbvc/yuliao_fenci.py index 2c5170e8..fa667a0e 100644 --- a/others/program/mnbvc/yuliao_fenci.py +++ b/others/program/mnbvc/yuliao_fenci.py @@ -22,7 +22,7 @@ def match_chinese(text): seg_list = jieba.cut_for_search("耙耙柑") print("搜索引擎模式: " + "/ ".join(seg_list)) -for i in range(1,5): +for i in range(0,5): print(i) read_file = open(f"cn_dicts_dazhu/zhihu_deal{i}.txt", 'r', encoding='utf-8') word_map = {}