Skip to content

Commit d676cd2

Browse files
authored
Merge pull request #577 from myshero/feature-post-to-custom-api
issues_feature_post_api_576 实现通过POST方式将数据推送到自定义接口
2 parents d7de931 + c0c7525 commit d676cd2

File tree

7 files changed

+86
-4
lines changed

7 files changed

+86
-4
lines changed

docs/settings.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,3 +239,15 @@ MySQL和MongoDB数据库的写入内容一样。程序首先会创建一个名为
239239
- **publish_tool**:存储微博的发布工具。
240240

241241
</details>
242+
243+
## 设置API接口POST联动(可选)
244+
245+
本部分是可选部分。如果不需要将爬取信息通过POST请求发送到指定API接口,可跳过这一步。如需启用,请在 `write_mode` 中加入 `post`,并在配置文件的 `post_config` 中填写下面说明的 `api_url` 和 `api_token`。
246+
247+
请求数据格式为 `content-type : application/json`,接口响应返回也需要是 `content-type : application/json`,HTTP状态码为 `200`
248+
249+
数据主体与 `write_mode` 配置为 `json` 时的输出格式一致,即一整页爬取结果的JSON数据;每爬取一页发送一次POST请求。
250+
251+
`api_url` 为指定的API接口地址
252+
253+
`api_token` 为接口鉴权TOKEN,将在 Request Headers 中添加 `api-token` 字段,根据需要配置

weibo_spider/config_sample.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,9 @@
2929
"connection_string": "mongodb://admin:password@localhost:27017/weibo",
3030
"dba_name": "",
3131
"dba_password": ""
32+
},
33+
"post_config": {
34+
"api_url": "",
35+
"api_token": ""
3236
}
3337
}

weibo_spider/config_util.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,14 +85,14 @@ def validate_config(config):
8585
sys.exit()
8686

8787
# 验证write_mode
88-
write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql', 'sqlite', 'kafka']
88+
write_mode = ['txt', 'csv', 'json', 'mongo', 'mysql', 'sqlite', 'kafka','post']
8989
if not isinstance(config['write_mode'], list):
9090
logger.warning(u'write_mode值应为list类型')
9191
sys.exit()
9292
for mode in config['write_mode']:
9393
if mode not in write_mode:
9494
logger.warning(
95-
u'%s为无效模式,请从txt、csv、json、mongo、sqlite, kafka和mysql中挑选一个或多个作为write_mode',
95+
u'%s为无效模式,请从txt、csv、json、post、mongo、sqlite, kafka和mysql中挑选一个或多个作为write_mode',
9696
mode)
9797
sys.exit()
9898

weibo_spider/parser/comment_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def get_long_weibo(self):
3333
# 3. 去掉所有 HTML 标签,但保留标签内的有效文本
3434
new_content = fromstring(html_string).text_content()
3535
# 4. 替换多个连续的 \n 为一个 \n
36-
new_content = re.sub(r'\n+', '\n', new_content)
36+
new_content = re.sub(r'\n+\s*', '\n', new_content)
3737
weibo_content = handle_garbled(new_content)
3838
if weibo_content is not None:
3939
return weibo_content

weibo_spider/spider.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ def __init__(self, config):
7474
self.sqlite_config = config.get('sqlite_config')
7575
self.kafka_config = config.get('kafka_config')
7676
self.mongo_config = config.get('mongo_config')
77+
self.post_config = config.get('post_config')
7778
self.user_config_file_path = ''
7879
user_id_list = config['user_id_list']
7980
if FLAGS.user_id_list:
@@ -284,6 +285,11 @@ def initialize_info(self, user_config):
284285

285286
self.writers.append(KafkaWriter(self.kafka_config))
286287

288+
if 'post' in self.write_mode:
289+
from .writer import PostWriter
290+
291+
self.writers.append(PostWriter(self.post_config))
292+
287293
self.downloaders = []
288294
if self.pic_download == 1:
289295
from .downloader import (OriginPictureDownloader,

weibo_spider/writer/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,6 @@
55
from .txt_writer import TxtWriter
66
from .sqlite_writer import SqliteWriter
77
from .kafka_writer import KafkaWriter
8+
from .post_writer import PostWriter
89

9-
__all__ = [CsvWriter, TxtWriter, JsonWriter, MongoWriter, MySqlWriter, SqliteWriter, KafkaWriter]
10+
# __all__ must contain the exported names as strings; listing the class
# objects themselves makes `from <package> import *` raise TypeError.
__all__ = [
    'CsvWriter', 'TxtWriter', 'JsonWriter', 'MongoWriter', 'MySqlWriter',
    'SqliteWriter', 'KafkaWriter', 'PostWriter',
]

weibo_spider/writer/post_writer.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
import codecs
2+
import json
3+
import logging
4+
import os
5+
import requests
6+
7+
from .writer import Writer
8+
from time import sleep
9+
from requests.exceptions import RequestException
10+
11+
logger = logging.getLogger('spider.post_writer')
12+
13+
class PostWriter(Writer):
    """Writer that pushes each crawled page of weibo data to an HTTP API.

    The payload is a dict with a ``user`` entry (the user's attributes) and a
    ``weibo`` list (one attribute dict per weibo), sent as a JSON POST body.
    """

    def __init__(self, post_config):
        """Remember the target endpoint and the optional auth token.

        Args:
            post_config: mapping with ``api_url`` (required) and
                ``api_token`` (optional).

        Raises:
            KeyError: if ``api_url`` is missing from ``post_config``.
        """
        self.post_config = post_config
        self.api_url = post_config['api_url']
        self.api_token = post_config.get('api_token', None)
        # NOTE(review): never read inside this class; looks like a leftover
        # copied from a database writer. Kept so existing attribute access
        # keeps working -- confirm before removing.
        self.dba_password = post_config.get('dba_password', None)
        # Filled in by write_user(); initialised here so write_weibo() can
        # detect a missing user instead of failing with AttributeError.
        self.user = None

    def write_user(self, user):
        """Keep the user object so it can be attached to every payload."""
        self.user = user

    def _update_json_data(self, data, weibo_info):
        """Fill ``data`` with the stored user and the given weibo dicts.

        Existing ``data['weibo']`` entries are extended, otherwise the list is
        created. Returns the same ``data`` dict.
        """
        data['user'] = self.user.__dict__
        if data.get('weibo'):
            data['weibo'] += weibo_info
        else:
            data['weibo'] = weibo_info
        return data

    def send_post_request_with_token(self, url, data, token, max_retries,
                                     backoff_factor, timeout=10):
        """POST ``data`` as JSON to ``url``, retrying on failure.

        Args:
            url: endpoint to post to.
            data: JSON-serialisable payload.
            token: value for the ``api-token`` request header; the header is
                omitted when the token is empty or None.
            max_retries: number of retries after the first attempt.
            backoff_factor: seconds multiplied by the attempt number to get
                the wait before the next retry.
            timeout: seconds to wait for the server before the attempt counts
                as failed, so a stalled endpoint cannot hang the crawl.

        Returns:
            The decoded JSON response on HTTP 200, or None once every attempt
            has failed (the failure is logged).
        """
        headers = {'Content-Type': 'application/json'}
        if token:
            # Only authenticate when a token is configured; otherwise the
            # literal string "None" would be sent as the header value.
            headers['api-token'] = f'{token}'

        for attempt in range(max_retries + 1):
            try:
                response = requests.post(url, json=data, headers=headers,
                                         timeout=timeout)
                if response.status_code != requests.codes.ok:
                    raise RequestException(
                        f"Unexpected response status: {response.status_code}")
                return response.json()
            except (RequestException, ValueError) as e:
                # ValueError covers a non-JSON body on requests versions whose
                # JSON decode error is not a RequestException subclass.
                if attempt < max_retries:
                    # Wait a little longer before each successive retry.
                    sleep(backoff_factor * (attempt + 1))
                    continue
                logger.error(f"在尝试{max_retries}次发出POST连接后,请求失败:{e}")
        return None

    def write_weibo(self, weibos):
        """POST one page of crawled weibos to the configured API.

        Nothing is sent when ``weibos`` is empty or when no user has been
        stored yet. The success message is logged only if the request
        actually returned a response.
        """
        if not weibos:
            logger.info(u'没有获取到微博,略过API POST')
            return
        if self.user is None:
            logger.warning(u'尚未获取到用户信息,略过API POST')
            return

        data = self._update_json_data({}, [w.__dict__ for w in weibos])
        result = self.send_post_request_with_token(self.api_url, data,
                                                   self.api_token, 3, 2)
        # None means every attempt failed (already logged as an error).
        if result is not None:
            logger.info(u'%d条微博通过POST发送到 %s', len(weibos), self.api_url)

0 commit comments

Comments
 (0)