From c0c75257a488f9448870d6b4821f217ca4560ec2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BA=8C=E4=B9=94?= <605056080@qq.com> Date: Sun, 28 Apr 2024 17:04:19 +0800 Subject: [PATCH] =?UTF-8?q?issues=5Ffeature=5Fpost=5Fapi=5F576=20=E5=AE=9E?= =?UTF-8?q?=E7=8E=B0=E9=80=9A=E8=BF=87POST=E6=96=B9=E5=BC=8F=E5=B0=86?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E6=8E=A8=E9=80=81=E5=88=B0=E8=87=AA=E5=AE=9A?= =?UTF-8?q?=E4=B9=89=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/parser/comment_parser.py | 2 +- weibo_spider/writer/post_writer.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py index 6e06c776..c0117d80 100644 --- a/weibo_spider/parser/comment_parser.py +++ b/weibo_spider/parser/comment_parser.py @@ -33,7 +33,7 @@ def get_long_weibo(self): # 3. 去掉所有 HTML 标签,但保留标签内的有效文本 new_content = fromstring(html_string).text_content() # 4. 替换多个连续的 \n 为一个 \n - new_content = re.sub(r'\n+', '\n', new_content) + new_content = re.sub(r'\n+\s*', '\n', new_content) weibo_content = handle_garbled(new_content) if weibo_content is not None: return weibo_content diff --git a/weibo_spider/writer/post_writer.py b/weibo_spider/writer/post_writer.py index 7446fbea..af536623 100644 --- a/weibo_spider/writer/post_writer.py +++ b/weibo_spider/writer/post_writer.py @@ -5,6 +5,8 @@ import requests from .writer import Writer +from time import sleep +from requests.exceptions import RequestException logger = logging.getLogger('spider.post_writer')