diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py index 6e06c776..c0117d80 100644 --- a/weibo_spider/parser/comment_parser.py +++ b/weibo_spider/parser/comment_parser.py @@ -33,7 +33,7 @@ def get_long_weibo(self): # 3. 去掉所有 HTML 标签,但保留标签内的有效文本 new_content = fromstring(html_string).text_content() # 4. 替换多个连续的 \n 为一个 \n - new_content = re.sub(r'\n+', '\n', new_content) + new_content = re.sub(r'\n+\s*', '\n', new_content) weibo_content = handle_garbled(new_content) if weibo_content is not None: return weibo_content diff --git a/weibo_spider/writer/post_writer.py b/weibo_spider/writer/post_writer.py index 7446fbea..af536623 100644 --- a/weibo_spider/writer/post_writer.py +++ b/weibo_spider/writer/post_writer.py @@ -5,6 +5,8 @@ import requests from .writer import Writer +from time import sleep +from requests.exceptions import RequestException logger = logging.getLogger('spider.post_writer')