-
Notifications
You must be signed in to change notification settings - Fork 3
/
sjk_content.py
157 lines (137 loc) · 4.24 KB
/
sjk_content.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# -*- encoding: utf-8 -*-
'''
@File : sjk_content.py
@Time : 2022/04/21 11:16:09
@Author : Coder-Sakura
@Version : 1.0
@Desc : None
'''
# here put the import lib
import os
import re
import time
import random
from docx import Document
# 居中
from docx.enum.text import WD_ALIGN_PARAGRAPH
# (正文)修改字体
from docx.oxml.ns import qn
from docx.shared import Pt, Cm, RGBColor
from sjk_tool import logger, network_connect
class SJK_CONTENT:
    """Turn a list of HTML fragments for one section into a .docx file.

    The usual entry point is :meth:`main`, which stores ``docx_path`` and
    ``text_content`` on the instance, filters the fragments with
    :meth:`remove_label` and writes the result with :meth:`write_docx`.
    """

    # Patterns are compiled once at class level instead of on every call.
    # Captures the value of each src='...' / src="..." attribute.
    _IMG_SRC_RE = re.compile(r""".*?src=['"](.*?)['"].*?""")
    # Control characters (\a \b \f \n \t \r \v) and literal backslashes.
    _ESCAPE_RE = re.compile(r"[\a\b\f\n\t\r\v\\]")
    # Any HTML tag.
    _HTML_TAG_RE = re.compile(r"<[^>]+>", re.S)
    # Byte-order marks left behind by files saved as UTF-8 with BOM.
    _BOM_RE = re.compile(u"[\ufeff]+", re.S)
    # Captures (width, height) attribute pairs, width first.
    _SIZE_RE = re.compile(
        r""".*?width=['"](.*?)['"].*?height=['"](.*?)['"].*?"""
    )

    def remove_label(self, text_content):
        """Filter raw fragments down to image links and plain text.

        For every fragment that mentions both ``src`` and ``img``, each
        ``src`` attribute value is emitted first (extracted after all spaces
        have been removed from the fragment). The fragment itself is then
        stripped of control characters/backslashes, HTML tags and BOMs, and
        emitted if anything is left.

        :params text_content: iterable of raw text/HTML strings
        :return: list of image links and cleaned text, in source order
        """
        res = []
        for text in text_content:
            # Image links come before the fragment's own text.
            if "src" in text and "img" in text:
                res.extend(self._IMG_SRC_RE.findall(text.replace(" ", "")))
            text = self._ESCAPE_RE.sub("", text)
            text = self._HTML_TAG_RE.sub("", text)
            text = self._BOM_RE.sub("", text)
            if text != "":
                res.append(text)
        return res

    def find_size(self, index):
        """Look up width/height attributes in ``self.text_content[index]``.

        :params index: index into the raw ``self.text_content`` list
        :return: list of ``(width, height)`` string tuples, e.g.
                 ``[('128', '36')]``; ``[]`` when nothing matches, the index
                 is out of range, or ``text_content`` is missing/unusable
        """
        try:
            return self._SIZE_RE.findall(self.text_content[index])
        except (AttributeError, IndexError, KeyError, TypeError):
            # No raw content stored, bad index, or a non-string entry.
            return []

    def get_img(self, url):
        """Download ``url`` to a temporary .jpg next to ``self.docx_path``.

        The file name is built from the current Unix time and a random
        number in 0..99. Failures are logged instead of being swallowed
        silently.

        :params url: image link to fetch via ``network_connect``
        :return: path of the saved file, or None if download/saving failed
        """
        img_path = os.path.join(
            os.path.dirname(self.docx_path),
            f"{str(int(time.time()))}-{str(random.choice(range(100)))}.jpg"
        )
        try:
            resp = network_connect(url)
            with open(img_path, "wb") as f:
                f.write(resp.content)
        except Exception as e:
            # Best effort: the caller falls back to writing the link as text.
            logger.warning(f"下载文档图片链接出错 - {url} - {e}")
            return None
        return img_path

    @staticmethod
    def _is_image_url(text):
        """Return True for http(s) links that mention .jpg, .png or .jpeg."""
        return text.startswith(("http://", "https://")) and any(
            ext in text for ext in (".jpg", ".png", ".jpeg")
        )

    @staticmethod
    def _add_text(document, text):
        """Append ``text`` as a 'Normal' paragraph with 20pt space before."""
        paragraph = document.add_paragraph(text, 'Normal')
        paragraph.paragraph_format.space_before = Pt(20)

    def _source_index(self, url, fallback):
        """Find the raw fragment that ``url`` was extracted from.

        Positions in the filtered list do not line up with positions in
        ``self.text_content`` once a fragment yields zero or several filtered
        items, so the raw list is searched for the link (spaces removed, as
        in :meth:`remove_label`).

        :return: index of the first matching raw fragment, else ``fallback``
        """
        raw_items = getattr(self, "text_content", None) or []
        for idx, raw in enumerate(raw_items):
            if isinstance(raw, str) and url in raw.replace(" ", ""):
                return idx
        return fallback

    def _insert_image(self, document, url, index):
        """Download ``url`` and add it to ``document`` as a picture.

        When the source fragment declares no width/height, the picture is
        scaled to a height of 3 cm keeping its aspect ratio; otherwise it is
        left at the size python-docx gave it. The temporary file is removed
        whether or not insertion succeeded.

        :return: True if the picture was inserted, False otherwise
        """
        try:
            img_path = self.get_img(url)
        except Exception as e:
            # e.g. self.docx_path was never set; never reuse a stale path.
            logger.warning(f"下载文档图片链接出错 - {url} - {e}")
            img_path = None
        if not img_path:
            return False

        try:
            shape = document.add_picture(img_path)
        except Exception as e:
            logger.warning(f"向docx内添加图片出错... - {url} - {e}")
            return False
        else:
            if not self.find_size(self._source_index(url, index)):
                raw_height, raw_width = shape.height, shape.width
                shape.height = Cm(3)
                shape.width = int(raw_width * shape.height / raw_height)
            return True
        finally:
            os.remove(img_path)

    def write_docx(self, section_name, docx_path, new_text_content):
        """Write one section to a .docx file.

        Items that look like image links are downloaded and embedded; if
        that fails the link is written as ordinary text. Every other item
        becomes a body paragraph.

        :params section_name: text of the centred level-1 heading
        :params docx_path: where the document is saved
        :params new_text_content: filtered items from :meth:`remove_label`
        :return : None
        """
        document = Document()

        # Heading: centred, black, with the East Asian font set explicitly.
        head = document.add_heading(level=1)
        head.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = head.add_run(section_name)
        run.font.name = u"微软雅黑"
        run.font.color.rgb = RGBColor(0, 0, 0)
        run._element.rPr.rFonts.set(qn('w:eastAsia'), u'微软雅黑')

        # Body style.
        normal_style = document.styles['Normal']
        normal_style.font.name = u'微软雅黑'
        normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), u'微软雅黑')

        for index, item in enumerate(new_text_content):
            if self._is_image_url(item) and \
                    self._insert_image(document, item, index):
                continue
            # Plain text, or an image that could not be embedded.
            self._add_text(document, item)

        document.save(docx_path)
        logger.success(f"文档下载成功: {docx_path}")
        time.sleep(1)

    def main(self, section_name, docx_path, text_content):
        """Filter ``text_content`` and write it to ``docx_path``.

        Does nothing except log when ``docx_path`` already exists.

        :params section_name: heading text for the document
        :params docx_path: target .docx path; temporary images are stored
                           in the same directory
        :params text_content: list of raw text/HTML strings
        :return : None
        """
        self.docx_path = docx_path
        self.text_content = text_content

        # Skip sections that were already generated.
        if os.path.exists(docx_path):
            logger.success(f"已存在: <docx_path> {docx_path}")
            return

        # Filter with the regexes first, then write.
        new_text_content = self.remove_label(text_content)
        self.write_docx(section_name, docx_path, new_text_content)