From 841aa51dffef8eba55a9ed406554f13e07493c64 Mon Sep 17 00:00:00 2001
From: gaboolic <23441099@qq.com>
Date: Mon, 1 Jul 2024 18:03:22 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A4=84=E7=90=86=E8=84=9A=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 others/program/deal_ice_dict_to_dazhu.py | 13 ++++++
 others/program/mnbvc/get_dict.py         | 54 ++++++++++++------------
 others/program/mnbvc/yuliao_fenci.py     |  2 +-
 3 files changed, 42 insertions(+), 27 deletions(-)
 create mode 100644 others/program/deal_ice_dict_to_dazhu.py

diff --git a/others/program/deal_ice_dict_to_dazhu.py b/others/program/deal_ice_dict_to_dazhu.py
new file mode 100644
index 00000000..f868dce0
--- /dev/null
+++ b/others/program/deal_ice_dict_to_dazhu.py
@@ -0,0 +1,13 @@
+import os
+import re
+import string
+import json
+
+# Swap the two tab-separated columns of each row; both files are UTF-8 and always closed.
+with open(os.path.join('cn_dicts_dazhu', "zhihu_deal_sort_merge.txt"), 'r', encoding='utf-8') as read_file, \
+        open(os.path.join('cn_dicts_dazhu', "知频.txt"), 'w', encoding='utf-8') as write_file:
+    for line in read_file:
+        params = line.strip().split("\t")
+        # A blank or tab-less row has no second column to swap; skip it instead of raising IndexError.
+        if len(params) >= 2:
+            write_file.write(f"{params[1]}\t{params[0]}\n")
diff --git a/others/program/mnbvc/get_dict.py b/others/program/mnbvc/get_dict.py
index 5d1e791f..6f8f11c6 100644
--- a/others/program/mnbvc/get_dict.py
+++ b/others/program/mnbvc/get_dict.py
@@ -8,31 +8,33 @@
 # file_name = os.path.join(os.path.expanduser("~/Downloads"),'liwu_253874_com.jsonl') # 里屋论坛
 # file_name = os.path.join(os.path.expanduser("~/Downloads"),'46.jsonl') #维基
 # file_name = os.path.join(os.path.expanduser("~/Downloads"),'oscar_202201.part_0075.jsonl') # 通用文本
-file_name = os.path.join(os.path.expanduser("~/mnbvc"),'5.jsonl') # 知乎 https://huggingface.co/datasets/liwu/MNBVC/tree/main/qa/20230196/zhihu
-write_file_name = os.path.join('cn_dicts_dazhu', "zhihu_deal5.txt")
-write_file = open(write_file_name, 'w')
-line_count = 0
-with open(file_name, 'r') as file:
-    # 逐行读取文件内容
-    for line in file:
-        line_count += 1
-        line = file.readline()
-        line = line.strip()
-        if len(line) == 0:
-            continue
 
-        try:
-            data = json.loads(line)
-        except json.decoder.JSONDecodeError:
-            print("JSONDecodeError")
-            print(line)
-            continue
-        
-        q_content = data['问']
-        a_content = data['答']
-        write_file.write(q_content)
-        write_file.write("\n")
-        write_file.write(a_content)
-        write_file.write("\n")
-    print(f"line_count {line_count}")
+
+# Export the question/answer text of MNBVC zhihu shards 0-4 into one text file per shard.
+for i in range(0,5):
+    file_name = os.path.join(os.path.expanduser("~/mnbvc"),f'{i}.jsonl') # Zhihu: https://huggingface.co/datasets/liwu/MNBVC/tree/main/qa/20230196/zhihu
+    # Must be an f-string: otherwise every shard overwrites one file literally named "zhihu_deal{i}.txt".
+    write_file_name = os.path.join('cn_dicts_dazhu', f"zhihu_deal{i}.txt")
+    line_count = 0
+    # Written as UTF-8 because yuliao_fenci.py reads these files back with encoding='utf-8'.
+    with open(file_name, 'r', encoding='utf-8') as file, \
+            open(write_file_name, 'w', encoding='utf-8') as write_file:
+        for line in file:
+            line_count += 1
+            line = line.strip()
+            if len(line) == 0:
+                continue
+            try:
+                data = json.loads(line)
+            except json.decoder.JSONDecodeError:
+                print("JSONDecodeError")
+                print(line)
+                continue
+            q_content = data['问']
+            a_content = data['答']
+            write_file.write(q_content)
+            write_file.write("\n")
+            write_file.write(a_content)
+            write_file.write("\n")
+        print(f"line_count {line_count}")
         
diff --git a/others/program/mnbvc/yuliao_fenci.py b/others/program/mnbvc/yuliao_fenci.py
index 2c5170e8..fa667a0e 100644
--- a/others/program/mnbvc/yuliao_fenci.py
+++ b/others/program/mnbvc/yuliao_fenci.py
@@ -22,7 +22,7 @@ def match_chinese(text):
 seg_list = jieba.cut_for_search("耙耙柑")
 print("搜索引擎模式: " + "/ ".join(seg_list))
 
-for i in range(1,5):
+for i in range(0,5):
     print(i)
     read_file = open(f"cn_dicts_dazhu/zhihu_deal{i}.txt", 'r', encoding='utf-8')
     word_map = {}