-
Notifications
You must be signed in to change notification settings - Fork 8
/
NaverBlogCrawler.py
112 lines (81 loc) · 5.23 KB
/
NaverBlogCrawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import re
import json
import math
import datetime
import requests
import urllib.request
import urllib.error
import urllib.parse
from bs4 import BeautifulSoup
# Naver Open API credentials. The functions below send them with every search
# request as the "X-Naver-Client-Id" and "X-Naver-Client-Secret" headers.
# Both values are placeholders and must be replaced before running the script.
naver_client_id = "INPUT YOUR CLIENT ID"
naver_client_secret = "INPUT YOUR CLIENT SECRET"
def naver_blog_crawling(search_blog_keyword, display_count, sort_type):
    """Run a full crawl for one keyword.

    First asks how many result pages are available for the keyword, then
    hands that page count to ``get_blog_post`` so every page is fetched and
    its posts are printed.

    Args:
        search_blog_keyword: Phrase to search blogs for.
        display_count: Number of results requested per page.
        sort_type: Value passed through as the search request's ``sort``
            query parameter.
    """
    get_blog_post(
        search_blog_keyword,
        display_count,
        get_blog_search_result_pagination_count(search_blog_keyword, display_count),
        sort_type,
    )
def get_blog_search_result_pagination_count(search_blog_keyword, display_count):
    """Return how many result pages should be processed for a keyword.

    Sends a single blog search request for the keyword, reads the ``total``
    hit count from the JSON response and converts it into a page count based
    on ``display_count``. When there is at least one hit, the hit count, the
    real page count and the capped page count are printed.

    Args:
        search_blog_keyword: Phrase to search for; URL-quoted before sending.
        display_count: Results per page. Converted with ``int()`` before the
            division, so a numeric string is accepted.

    Returns:
        ``0`` when the search has no hits or the response status is not 200;
        otherwise ``ceil(total / display_count)`` capped at 1000.

    Raises:
        urllib.error.URLError: Propagated from ``urllib.request.urlopen``
            when the request fails (``HTTPError`` is a subclass).
        KeyError: If the response JSON has no ``total`` field.
    """
    max_page_count = 1000

    encode_search_keyword = urllib.parse.quote(search_blog_keyword)
    url = "https://openapi.naver.com/v1/search/blog?query=" + encode_search_keyword

    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", naver_client_id)
    request.add_header("X-Naver-Client-Secret", naver_client_secret)

    # The context manager closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(request) as response:
        # Compare by value: ``is 200`` tests object identity, which only
        # happens to work because CPython caches small integers.
        if response.getcode() != 200:
            # Nothing usable came back; report "no pages" rather than
            # leaving the result undefined for the caller.
            return 0
        response_body_dict = json.loads(response.read().decode('utf-8'))

    total_post_count = response_body_dict['total']
    if total_post_count == 0:
        return 0

    blog_pagination_total_count = math.ceil(total_post_count / int(display_count))
    blog_pagination_count = min(blog_pagination_total_count, max_page_count)

    print("키워드 " + search_blog_keyword + "에 해당하는 포스팅 수 : " + str(total_post_count))
    print("키워드 " + search_blog_keyword + "에 해당하는 블로그 실제 페이징 수 : " + str(blog_pagination_total_count))
    print("키워드 " + search_blog_keyword + "에 해당하는 블로그 처리할 수 있는 페이징 수 : " + str(blog_pagination_count))

    return blog_pagination_count
def _print_blog_post(item, html_tag_pattern, timeout):
    """Print one search result item together with the post text scraped from its page.

    Args:
        item: One element of the search response's ``items`` list. The keys
            ``link``, ``title``, ``description``, ``postdate`` and
            ``bloggername`` are read from it.
        html_tag_pattern: Compiled regex used to strip markup from the title
            and description.
        timeout: Seconds passed to each ``requests.get`` call.
    """
    blog_post_url = item['link'].replace("amp;", "")
    blog_post_title = html_tag_pattern.sub('', item['title'])
    blog_post_description = html_tag_pattern.sub('', item['description'])
    blog_post_postdate = datetime.datetime.strptime(item['postdate'], "%Y%m%d").strftime("%y.%m.%d")
    blog_post_blogger_name = item['bloggername']

    outer_page = requests.get(blog_post_url, timeout=timeout)
    outer_soup = BeautifulSoup(outer_page.text, 'lxml')

    # The outer page only hosts a frame; the frame's ``src`` is a path that
    # is joined onto the blog host to reach the page holding the post body.
    for frame in outer_soup.select('frame#mainFrame'):
        real_blog_post_url = "http://blog.naver.com" + frame.get('src')
        inner_page = requests.get(real_blog_post_url, timeout=timeout)
        inner_soup = BeautifulSoup(inner_page.text, 'lxml')

        for blog_post_content in inner_soup.select('div#postViewArea'):
            blog_post_full_contents = str(blog_post_content.get_text())

            print("포스팅 URL : " + blog_post_url)
            print("포스팅 제목 : " + blog_post_title)
            print("포스팅 설명 : " + blog_post_description)
            print("포스팅 날짜 : " + blog_post_postdate)
            print("블로거 이름 : " + blog_post_blogger_name)
            print("포스팅 내용 : " + blog_post_full_contents)


def get_blog_post(search_blog_keyword, display_count, search_result_blog_page_count, sort_type):
    """Fetch every search result page for a keyword and print the posts on it.

    For each page the search API is queried, and for each returned item the
    blog page is downloaded and its URL, title, description, date, blogger
    name and body text are printed. A post that cannot be fetched or parsed
    is skipped so one bad post does not stop the crawl.

    Args:
        search_blog_keyword: Phrase to search for; URL-quoted before sending.
        display_count: Number of results requested per page.
        search_result_blog_page_count: Number of pages to walk through. Zero
            or a negative value means nothing is requested.
        sort_type: Value sent as the ``sort`` query parameter.

    Raises:
        urllib.error.URLError: Propagated from ``urllib.request.urlopen``
            when a search request fails (``HTTPError`` is a subclass).
    """
    # Largest ``start`` value the search API accepts.
    max_start_position = 1000
    # Seconds to wait on a single blog page before giving up on that post.
    blog_request_timeout = 10
    # Compiled once here instead of once per post.
    remove_html_tag = re.compile('<.*?>')

    encode_search_blog_keyword = urllib.parse.quote(search_blog_keyword)

    for page_number in range(1, search_result_blog_page_count + 1):
        # ``start`` is the 1-based position of the first result, not a page
        # number. Sending the page number made consecutive requests return
        # almost the same window, so each post was printed many times.
        start_position = (page_number - 1) * int(display_count) + 1
        if start_position > max_start_position:
            break

        url = ("https://openapi.naver.com/v1/search/blog?query=" + encode_search_blog_keyword
               + "&display=" + str(display_count)
               + "&start=" + str(start_position)
               + "&sort=" + sort_type)

        request = urllib.request.Request(url)
        request.add_header("X-Naver-Client-Id", naver_client_id)
        request.add_header("X-Naver-Client-Secret", naver_client_secret)

        # The context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(request) as response:
            # Compare by value, not identity (``is 200``).
            if response.getcode() != 200:
                continue
            response_body_dict = json.loads(response.read().decode('utf-8'))

        for item in response_body_dict['items']:
            try:
                _print_blog_post(item, remove_html_tag, blog_request_timeout)
            except Exception:
                # Deliberate best-effort: skip posts that fail to download or
                # parse. ``Exception`` (not a bare ``except``) keeps Ctrl-C
                # and SystemExit able to stop the crawl.
                continue
if __name__ == "__main__":
    # Script entry point: crawl one fixed keyword, asking for 100 results per
    # page with the "sim" sort option.
    naver_blog_crawling(
        search_blog_keyword="파이썬 컨벤션",
        display_count=100,
        sort_type="sim",
    )