diff --git a/tests/test_parser/test_comment_parser.py b/tests/test_parser/test_comment_parser.py
index 51c79374..72aa58c3 100644
--- a/tests/test_parser/test_comment_parser.py
+++ b/tests/test_parser/test_comment_parser.py
@@ -21,6 +21,6 @@ def test_comment_parser():
"""此时此刻,我们比以往更加重视与自然相处的方式,我们也从未如此迫切需要将想法付诸行动。"""
"""热巴已经和我们@北京绿色阳光 站在一起,希望看完视频的你们,也能获得同样感受与动力。"""
"""We Stand for Wildlife. 明日朝阳68309的优酷视频 \xa0"""
- """原文转发[1000000] \xa0原文评论[38688] 转发理由: 在羌塘的美好回忆~"""
+ """原文转发[1000000] \xa0原文评论[38688] 转发理由: 在羌塘的美好回忆~"""
"""第一次来到这片独特的荒野,看到野生动物自由生活,还有一群快乐可爱的人在守护着它们。"""
"""把这些美好留存下来,关注野生动物保护,积极行动,我们每个人都能贡献力量。 \xa0 """)
diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py
index c0117d80..6a409994 100644
--- a/weibo_spider/parser/comment_parser.py
+++ b/weibo_spider/parser/comment_parser.py
@@ -24,17 +24,30 @@ def get_long_weibo(self):
for i in range(5):
self.selector = handle_html(self.cookie, self.url)
if self.selector is not None:
- info_div = self.selector.xpath("//div[@class='c' and @id='M_']")[0]
+ parent_div = self.selector.xpath("//div[@class='c' and @id='M_']")[0]
+ info_div = parent_div[0]
+ detail_div = parent_div[1]
+ all_content = etree.Element("div")
+
+            # gets the first div's content, starting at the 'ctt' span
info_span = info_div.xpath("//span[@class='ctt']")[0]
+ for elem in info_div[info_div.index(info_span):]:
+ all_content.append(elem)
+
+ # gets all content in the second div
+ for elem in detail_div:
+ all_content.append(elem)
+
-            # 1. 获取 info_span 中的所有 HTML 代码作为字符串
-            html_string = etree.tostring(info_span, encoding='unicode', method='html')
-            # 2. 将 <br> 替换为 \n
-            html_string = html_string.replace('<br>', '\n')
-            # 3. 去掉所有 HTML 标签,但保留标签内的有效文本
+            # 1. 获取 all_content 中的所有 HTML 代码作为字符串
+            html_string = etree.tostring(all_content, encoding='unicode', method='html')
+            # 2. 去掉所有 HTML 标签,但保留标签内的有效文本
new_content = fromstring(html_string).text_content()
- # 4. 替换多个连续的 \n 为一个 \n
+ # 3. 替换多个连续的 \n 为一个 \n
new_content = re.sub(r'\n+\s*', '\n', new_content)
+            # 4. gets only the content before the MM月DD日 HH:MM timestamp
+ new_content = re.split(r'\d{2}月\d{2}日\s+\d{2}:\d{2}', new_content)[0]
weibo_content = handle_garbled(new_content)
+
if weibo_content is not None:
return weibo_content
sleep(random.randint(6, 10))
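
The sketch below mirrors the extraction flow introduced by this patch, for anyone who wants to try it outside the spider: it gathers the children of the two divs under the M_ container into a temporary element, serializes that element, strips the tags with text_content(), collapses repeated newlines, and keeps only the text before the MM月DD日 HH:MM timestamp. The sample HTML, the timestamp in it, and the extract_long_weibo_text helper are illustrative assumptions, not part of the patch; only the lxml calls follow the patched get_long_weibo.

# Minimal standalone sketch of the new flow (the sample HTML, its timestamp and
# the helper name are illustrative assumptions, not part of the patch).
import re

from lxml import etree
from lxml.html import fromstring

SAMPLE_HTML = """
<html><body>
  <div class="c" id="M_">
    <div><span class="ctt">We Stand for Wildlife. 在羌塘的美好回忆~</span></div>
    <div><span>原文转发[1000000]&#160;原文评论[38688]</span> 02月21日 12:30</div>
  </div>
</body></html>
"""


def extract_long_weibo_text(html):
    selector = fromstring(html)
    parent_div = selector.xpath("//div[@class='c' and @id='M_']")[0]
    info_div, detail_div = parent_div[0], parent_div[1]

    # gather the content of both inner divs into one temporary element
    all_content = etree.Element("div")
    info_span = info_div.xpath(".//span[@class='ctt']")[0]
    for elem in info_div[info_div.index(info_span):]:
        all_content.append(elem)
    for elem in detail_div:
        all_content.append(elem)

    # serialize, strip tags, collapse newlines, drop the trailing timestamp
    html_string = etree.tostring(all_content, encoding="unicode", method="html")
    text = fromstring(html_string).text_content()
    text = re.sub(r"\n+\s*", "\n", text)
    return re.split(r"\d{2}月\d{2}日\s+\d{2}:\d{2}", text)[0]


if __name__ == "__main__":
    print(extract_long_weibo_text(SAMPLE_HTML))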