处理脚本

gaboolic · Jul 1, 2024 · 841aa51 · 841aa51
1 parent 5d36140
commit 841aa51
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 27 deletions.
diff --git a/others/program/deal_ice_dict_to_dazhu.py b/others/program/deal_ice_dict_to_dazhu.py
@@ -0,0 +1,22 @@
+import os
+import re
+import string
+import json
+
+# Swap the two tab-separated columns of zhihu_deal_sort_merge.txt and write
+# the result to 知频.txt in the same directory.
+input_path = os.path.join('cn_dicts_dazhu', "zhihu_deal_sort_merge.txt")
+output_path = os.path.join('cn_dicts_dazhu', "知频.txt")
+
+# Explicit UTF-8 so the result does not depend on the platform's default
+# encoding (the data is presumably Chinese text -- TODO confirm).
+with open(input_path, 'r', encoding='utf-8') as read_file, \
+        open(output_path, 'w', encoding='utf-8') as write_file:
+    # Read the file line by line.
+    for line in read_file:
+        params = line.strip().split("\t")
+        # Skip blank or malformed lines that have no second column instead of
+        # crashing with IndexError.
+        if len(params) < 2:
+            continue
+        write_file.write(f"{params[1]}\t{params[0]}\n")
diff --git a/others/program/mnbvc/get_dict.py b/others/program/mnbvc/get_dict.py
@@ -8,31 +8,39 @@
 # file_name = os.path.join(os.path.expanduser("~/Downloads"),'liwu_253874_com.jsonl') # 里屋论坛
 # file_name = os.path.join(os.path.expanduser("~/Downloads"),'46.jsonl') #维基
 # file_name = os.path.join(os.path.expanduser("~/Downloads"),'oscar_202201.part_0075.jsonl') # 通用文本
-file_name = os.path.join(os.path.expanduser("~/mnbvc"),'5.jsonl') # 知乎 https://huggingface.co/datasets/liwu/MNBVC/tree/main/qa/20230196/zhihu
-write_file_name = os.path.join('cn_dicts_dazhu', "zhihu_deal5.txt")
-write_file = open(write_file_name, 'w')
-line_count = 0
-with open(file_name, 'r') as file:
-    # 逐行读取文件内容
-    for line in file:
-        line_count += 1
-        line = file.readline()
-        line = line.strip()
-        if len(line) == 0:
-            continue
 
-        try:
-            data = json.loads(line)
-        except json.decoder.JSONDecodeError:
-            print("JSONDecodeError")
-            print(line)
-            continue
-
-        q_content = data['问']
-        a_content = data['答']
-        write_file.write(q_content)
-        write_file.write("\n")
-        write_file.write(a_content)
-        write_file.write("\n")
-    print(f"line_count {line_count}")
+
+# Extract the question and answer text of each JSONL shard (0..4) into its
+# own plain-text file, one output file per shard.
+for i in range(0,5):
+    file_name = os.path.join(os.path.expanduser("~/mnbvc"),f'{i}.jsonl') # Zhihu https://huggingface.co/datasets/liwu/MNBVC/tree/main/qa/20230196/zhihu
+    # The f-prefix is required: without it every shard is written to the
+    # same literal "zhihu_deal{i}.txt" path and overwrites the previous one.
+    write_file_name = os.path.join('cn_dicts_dazhu', f"zhihu_deal{i}.txt")
+    line_count = 0
+    # Both files are opened as UTF-8 and closed by the with-statement, so no
+    # output handle is left open across shard iterations.
+    with open(file_name, 'r', encoding='utf-8') as file, \
+            open(write_file_name, 'w', encoding='utf-8') as write_file:
+        # Read the file line by line.
+        for line in file:
+            line_count += 1
+            line = line.strip()
+            if not line:
+                continue
+
+            try:
+                data = json.loads(line)
+            except json.decoder.JSONDecodeError:
+                print("JSONDecodeError")
+                print(line)
+                continue
+
+            q_content = data['问']
+            a_content = data['答']
+            write_file.write(q_content)
+            write_file.write("\n")
+            write_file.write(a_content)
+            write_file.write("\n")
+        print(f"line_count {line_count}")
 
diff --git a/others/program/mnbvc/yuliao_fenci.py b/others/program/mnbvc/yuliao_fenci.py
@@ -22,7 +22,7 @@ def match_chinese(text):
 seg_list = jieba.cut_for_search("耙耙柑")
 print("搜索引擎模式: " + "/ ".join(seg_list))
 
-for i in range(1,5):
+for i in range(0,5):
     print(i)
     read_file = open(f"cn_dicts_dazhu/zhihu_deal{i}.txt", 'r', encoding='utf-8')
     word_map = {}