
Commit 225f54a

Add crawler for Toutiao (今日头条) news
1 parent 8f8d052 commit 225f54a

File tree

5 files changed: +147 −0 lines changed


.DS_Store

4 KB
Binary file not shown.

TouTiao/README.md

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
# Notes

## About this program

- A crawler for Toutiao (今日头条) news, written in a hurry; the code is not pretty, but it works.

- It contains a proxy section that you can adapt to your own setup; see the sketch after this section.

## About the author

I work in `big data` and `data analysis`; fellow practitioners are welcome to get in touch~

- GitHub: [https://github.com/SoliDeoGloria31](https://github.com/SoliDeoGloria31)

- Gitee (码云): [https://gitee.com/joseph31](https://gitee.com/joseph31)

- WeChat: mortaltiger

<img src="https://gitee.com/joseph31/picture_bed/raw/master/mortaltiger.jpg" width="15%">

- Personal WeChat official account: JosephNest (Joseph 的小窝)

Frequent testing of new features makes the server unstable, so outages may occur. It implements an `automatic recommendation system`, `auto-reply`, `feature-request comments`, `AI integration (image recognition)`, and `other custom features`.

<img src="https://gitee.com/joseph31/picture_bed/raw/master/JosephNest.jpg" width="15%">
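The proxy section the README refers to is the `proxies` dict in `toutiao.py`, which targets a KDL tunnel proxy. A minimal sketch of swapping in your own credentials is below; the host, username, and password are placeholders, not working values.

```python
# Placeholder credentials -- substitute values from your own proxy provider.
tunnel_host = "your-tunnel-host.example.com"  # hypothetical host
tunnel_port = "15818"
tid = "your-username"
password = "your-password"

proxies = {
    "http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
    "https": "https://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
}

# requests routes traffic through the tunnel whenever `proxies` is passed in;
# omit the `proxies=` argument entirely to crawl without a proxy.
```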

TouTiao/pictures/JosephNest.jpg

39 KB

TouTiao/pictures/mortaltiger.jpg

38.1 KB

TouTiao/toutiao.py

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
import requests
from requests.exceptions import ConnectionError
from lxml import etree

import time
from selenium import webdriver

from selenium.webdriver.chrome.options import Options
import csv
import pandas as pd
from urllib.parse import quote
import re
from fake_useragent import UserAgent
import random

base_url = 'https://www.toutiao.com/api/search/content/'
timestamp = int(time.time() * 1000)

ua = UserAgent(verify_ssl=False)
article_url_list = []
# csv_name = pd.read_csv("typhoon_toutiao.csv")  # unused, and raises
# FileNotFoundError when the file is absent, so it is left commented out

page_urls = ["http://dev.kdlapi.com/testproxy",
             "https://dev.kdlapi.com/testproxy",
             ]

# Tunnel proxy server
tunnel_host = "tps189.kdlapi.com"
tunnel_port = "15818"

# Tunnel proxy username and password
tid = "t17888082960619"
password = "gid72p4o"

proxies = {
    "http": "http://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port),
    "https": "https://%s:%s@%s:%s/" % (tid, password, tunnel_host, tunnel_port)
}

# Guard against duplicates
constract_list = []


# Collect every article URL from the search result pages for one keyword
def get_article_urls(name):
    decde = quote(name)
    referer = 'https://www.toutiao.com/search/?keyword=' + decde
    # The search usually returns 10 pages of 20 results, hence the cap at
    # offset 120; some queries have fewer pages.
    for offset in range(0, 120, 20):
        params = {
            'aid': 24,
            'app_name': 'web_search',
            'offset': offset,
            'format': 'json',
            'keyword': name,
            'autoload': 'true',
            'count': 20,
            'en_qc': 1,
            'cur_tab': 1,
            'from': 'search_tab',
            'pd': 'synthesis',
            'timestamp': timestamp
        }
        headers = {
            'cookie': 'tt_webid=6781305717874820616; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6781305717874820616; s_v_web_id=59cfa658a89df645e8a82f1618a81bd0; __tasessionId=g8ptymp5v1579144106433',
            'user-agent': ua.random,
            'x-requested-with': 'XMLHttpRequest',
            'referer': referer,
        }
        html = requests.get(url=base_url, params=params,
                            headers=headers, proxies=proxies)
        result = list(html.json().get('data') or [])  # 'data' may be null
        for item in result:
            article_url = item.get('article_url')  # extract each article's URL
            if article_url and len(article_url) < 100 and (".mp4" not in article_url) and "toutiao.com" in article_url:
                if '/group/' in article_url:
                    article_url = article_url.replace(
                        '/group/', '/a').replace('http://', 'https://www.')
                article_url_list.append(article_url)
                print(article_url)


# Visit every collected URL with Selenium and append the article text
# to "<keyword>.csv"
def request_AND_storage(name):
    filename = name + ".csv"
    try:
        get_article_urls(name)
    except Exception as e:
        print(e)

    browser = webdriver.Chrome()

    time.sleep(2)
    for url in article_url_list:
        print(url)
        try:
            browser.get(url)
            time.sleep(1)
            # Selenium 3 API; on Selenium 4+ use
            # browser.find_element(By.XPATH, ...) instead
            text_res = browser.find_element_by_xpath(
                '//div[@class="article-box"]')
            print(text_res)
            text_res = text_res.text
            print(text_res)
            with open(filename, 'a', encoding='utf-8') as f:
                writer = csv.writer(f)
                L = [name, text_res]
                writer.writerow(L)
        except Exception:
            continue

    browser.close()


if __name__ == '__main__':
    try:
        request_AND_storage('武汉疫情')
        article_url_list = []
        time.sleep(10)
    except Exception as e:
        print(e)
        article_url_list = []
        time.sleep(1)
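Each row that `request_AND_storage` appends is just `[keyword, article_text]` with no header row, so a read-back needs explicit column names. A small sketch, assuming the run above produced `武汉疫情.csv`:

```python
import pandas as pd

# The crawler writes rows of [keyword, article_text] without a header,
# so supply the column names explicitly when loading the results.
df = pd.read_csv('武汉疫情.csv', names=['keyword', 'text'])
print(df.shape)
print(df['text'].str.len().describe())  # rough length distribution of articles
```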

0 commit comments
