Skip to content

Commit

Permalink
处理脚本
Browse files Browse the repository at this point in the history
  • Loading branch information
gaboolic committed Jul 1, 2024
1 parent 5d36140 commit 841aa51
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 27 deletions.
13 changes: 13 additions & 0 deletions others/program/deal_ice_dict_to_dazhu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import os
import re
import string
import json

# Convert the merged Zhihu frequency list into the 知频.txt dictionary by
# swapping the first two tab-separated columns of every line.
# `with` guarantees both files are flushed and closed, even on error.
# UTF-8 is set explicitly because the data is Chinese text and the platform
# default encoding is not reliable (assumes the input is UTF-8 -- confirm).
with open(os.path.join('cn_dicts_dazhu', "zhihu_deal_sort_merge.txt"), 'r', encoding='utf-8') as read_file, \
        open(os.path.join('cn_dicts_dazhu', "知频.txt"), 'w', encoding='utf-8') as write_file:
    # Read the file line by line
    for line in read_file:
        line = line.strip()
        params = line.split("\t")
        if len(params) < 2:
            # Blank or single-column line: there is nothing to swap, and
            # indexing params[1] would raise IndexError.
            continue

        write_file.write(f"{params[1]}\t{params[0]}\n")
54 changes: 28 additions & 26 deletions others/program/mnbvc/get_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,31 +8,33 @@
# file_name = os.path.join(os.path.expanduser("~/Downloads"),'liwu_253874_com.jsonl') # 里屋论坛
# file_name = os.path.join(os.path.expanduser("~/Downloads"),'46.jsonl') #维基
# file_name = os.path.join(os.path.expanduser("~/Downloads"),'oscar_202201.part_0075.jsonl') # 通用文本
# Extract the question/answer text of one Zhihu JSONL shard into a plain-text
# file, one field per line.
file_name = os.path.join(os.path.expanduser("~/mnbvc"),'5.jsonl') # Zhihu https://huggingface.co/datasets/liwu/MNBVC/tree/main/qa/20230196/zhihu
write_file_name = os.path.join('cn_dicts_dazhu', "zhihu_deal5.txt")
line_count = 0
# `with` closes both files; UTF-8 is explicit so the output matches the
# encoding the downstream segmentation script reads it with.
with open(write_file_name, 'w', encoding='utf-8') as write_file, \
        open(file_name, 'r', encoding='utf-8') as file:
    # Read the file line by line. The loop variable already holds the current
    # line; calling file.readline() here as well would discard it and skip
    # every other record.
    for line in file:
        line_count += 1
        line = line.strip()
        if len(line) == 0:
            continue

        try:
            data = json.loads(line)
        except json.decoder.JSONDecodeError:
            # Report the malformed record and keep going with the next line.
            print("JSONDecodeError")
            print(line)
            continue

        # '问' is the question field and '答' the answer field of a record.
        q_content = data['问']
        a_content = data['答']
        write_file.write(q_content)
        write_file.write("\n")
        write_file.write(a_content)
        write_file.write("\n")
print(f"line_count {line_count}")

# Extract the question/answer text of Zhihu JSONL shards 0-4, writing each
# shard to its own plain-text file, one field per line.
for i in range(0, 5):
    file_name = os.path.join(os.path.expanduser("~/mnbvc"), f'{i}.jsonl')  # Zhihu https://huggingface.co/datasets/liwu/MNBVC/tree/main/qa/20230196/zhihu
    # The f-prefix is required: without it every shard is written to a single
    # file literally named "zhihu_deal{i}.txt", each one overwriting the last.
    write_file_name = os.path.join('cn_dicts_dazhu', f"zhihu_deal{i}.txt")
    line_count = 0
    # `with` closes both files on every iteration; UTF-8 is explicit so the
    # output matches the encoding the downstream segmentation script uses.
    with open(write_file_name, 'w', encoding='utf-8') as write_file, \
            open(file_name, 'r', encoding='utf-8') as file:
        # Read the file line by line
        for line in file:
            line_count += 1
            line = line.strip()
            if len(line) == 0:
                continue

            try:
                data = json.loads(line)
            except json.decoder.JSONDecodeError:
                # Report the malformed record and keep going with the next line.
                print("JSONDecodeError")
                print(line)
                continue

            # '问' is the question field and '答' the answer field of a record.
            q_content = data['问']
            a_content = data['答']
            write_file.write(q_content)
            write_file.write("\n")
            write_file.write(a_content)
            write_file.write("\n")
    # line_count is reset for every shard, so report it once per shard.
    print(f"line_count {line_count}")

2 changes: 1 addition & 1 deletion others/program/mnbvc/yuliao_fenci.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def match_chinese(text):
# Run jieba's search-engine-mode segmentation on a sample word and print the
# resulting tokens joined by "/ " (the printed label means "search engine mode").
seg_list = jieba.cut_for_search("耙耙柑")
print("搜索引擎模式: " + "/ ".join(seg_list))

for i in range(1,5):
for i in range(0,5):
print(i)
read_file = open(f"cn_dicts_dazhu/zhihu_deal{i}.txt", 'r', encoding='utf-8')
word_map = {}
Expand Down

0 comments on commit 841aa51

Please sign in to comment.