-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
76 lines (65 loc) · 2.54 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import logging
import json
import re
# Match Markdown image links and replace the ones that have substitute content.
def replace_images(md_content: str, img_dict: dict) -> str:
    """Replace Markdown image links whose destination is a key of ``img_dict``.

    Every ``![description](destination)`` occurrence in ``md_content`` is
    examined. The destination is looked up in ``img_dict`` exactly as written;
    if that fails, it is looked up again with surrounding whitespace and a
    trailing quoted title (``"..."`` or ``'...'``) removed, so that
    ``![alt](images/a.jpg "Title")`` matches the key ``images/a.jpg``.

    Args:
        md_content: Markdown text to process.
        img_dict: Maps an image destination to the text that replaces the
            whole image link.

    Returns:
        The Markdown text with every matched image link replaced. When the
        link has a non-empty description, the replacement is followed by a
        blank line and the description wrapped in ``*...*``. Links whose
        destination is not in ``img_dict`` are left untouched.
    """
    image_link = re.compile(r'!\[(.*?)\]\((.*?)\)')
    # A quoted title at the end of the destination, e.g. ` "Title"`.
    title_suffix = re.compile(r'''\s+(?:"[^"]*"|'[^']*')\s*$''')

    def replacer(match):
        img_desc = match.group(1)
        img_path = match.group(2)
        if img_path not in img_dict:
            # The exact text is tried first so existing keys keep working;
            # only then fall back to the bare destination.
            img_path = title_suffix.sub('', img_path.strip())
            if img_path not in img_dict:
                return match.group(0)  # Unknown image: keep the link as is.
        replacement = img_dict[img_path]
        if img_desc:
            replacement = f"{replacement}\n\n*{img_desc}*"
        return replacement

    return image_link.sub(replacer, md_content)
# Configure the root logger at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Input files, output file and the directory that contains them.
base_path = r'C:\Users\liuch\Documents\00\hanrui_50W'
middle_path = r'C:\Users\liuch\Documents\00\hanrui_50W\middle.json'
md_path = r'C:\Users\liuch\Documents\00\hanrui_50W\NPD2308检验报告及检验记录.md'
new_md_path = r'C:\Users\liuch\Documents\00\hanrui_50W\NPD2308检验报告及检验记录_new.md'


def _iter_table_image_paths(parsed):
    """Yield the image path stored in each ``table_body`` of ``parsed``.

    Walks ``parsed['pdf_info']`` page by page, looks at the entries of
    ``preproc_blocks`` whose type is ``table``, and for each of their
    sub-blocks of type ``table_body`` yields the ``image_path`` of the first
    span of the first line.
    """
    for page in parsed['pdf_info']:
        for region in page['preproc_blocks']:
            if region['type'] != 'table':
                continue
            for part in region['blocks']:
                if part['type'] != 'table_body':
                    continue
                yield part['lines'][0]['spans'][0]['image_path']


# Load the JSON file.
with open(middle_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Load the Markdown file content.
with open(md_path, 'r', encoding='utf-8') as md_file:
    content = md_file.read()

# Collect the image path of every table body found in the JSON data.
table_image_list = list(_iter_table_image_paths(data))
from vision import TableStructureRecognizer
from PIL import Image
from ui_deepdoc import get_table_data, get_instance
# OCR instance and table-structure recognizer, created once and reused for
# every image.
ocr = get_instance()
detr = TableStructureRecognizer()
# Not read anywhere in the visible code; kept so the module's names are
# unchanged.
ocr_pic_pil_image = []
ocr_pic_pil_image_name = []
# Maps "images/<name>" (the key replace_images looks up) to the table data
# extracted from that image.
result = {}
# Detection thresholds, tried in descending order until a layout is returned.
thresholds = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]

print(f"表格图片: {table_image_list}")
for pic in table_image_list:
    # The context manager closes the underlying image file once this picture
    # has been processed instead of leaving the handle open.
    with Image.open(base_path + "/images/" + pic) as img:
        layouts = []
        for threshold in thresholds:
            layouts = detr([img], threshold)
            # NOTE(review): if detr returns one entry per input image,
            # len(layouts) is 1 at every threshold and only 0.9 is ever
            # used - confirm against TableStructureRecognizer.
            if len(layouts) > 0:
                break
            logger.debug("no table layout for %s at threshold %s", pic, threshold)
        else:
            # Every threshold came back empty: no entry is added to result,
            # so replace_images leaves this image link unchanged.
            logger.warning("no table layout detected for %s", pic)

        for lyt in layouts:
            try:
                table_data = get_table_data(img, lyt, ocr, True)
            except Exception:
                # Best effort: record the failure with its traceback and
                # carry on with the remaining layouts and images.
                logger.exception("err: failed to extract table data from %s", pic)
            else:
                # All layouts of one image share a key, so the last
                # successful extraction is the one that is kept.
                result["images/" + pic] = table_data

new_content = replace_images(content, result)
with open(new_md_path, 'w', encoding='utf-8') as file:
    file.write(new_content)
print(f"处理后的Markdown文件已保存至 {new_md_path}")