forked from tfeltwell/Guardian-comment-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrapeComments.py
63 lines (50 loc) · 2.29 KB
/
scrapeComments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from bs4 import BeautifulSoup
import urllib
def getHTML(url):
    """Fetch *url* over HTTP and return its body parsed as a BeautifulSoup tree.

    Uses Python 3's urllib.request (the old py2 ``urllib.urlopen`` no longer
    exists) and closes the response via a context manager so the socket is
    not leaked.  An explicit parser is passed to BeautifulSoup so the result
    does not depend on which parsers happen to be installed.
    """
    from urllib.request import urlopen  # local import: keeps this block self-contained

    with urlopen(url) as response:
        html = response.read()
    return BeautifulSoup(html, 'html.parser')
def scrapeComments(url):
    """Scrape all Guardian comments for the article at *url*.

    Returns a list of dicts (oldest first), one per comment, with keys:
    ``id``, ``timestamp``, ``author``, ``author-id``, ``text`` and
    ``reply-to`` (the id of the comment replied to, or '' for top-level
    comments).  Pages are fetched last-to-first and each page's comments are
    reversed so the combined list is in chronological order.
    """
    from urllib.parse import urlencode  # py3 home of urlencode (was urllib.urlencode in py2)

    articleSoup = getHTML(url)
    articleTitle = articleSoup.find('h1', class_="content__headline").getText().strip()
    # The article page links to the discussion page that hosts the comments.
    commentUrl = articleSoup.find(class_='discussion__heading').find('a')['href']
    print('Finding comments for [{0}]({1})\n'.format(articleTitle, url))

    commentSoup = getHTML(commentUrl)
    # Work out how many pages of comments exist.  Prefer the explicit
    # "last page" button; fall back to the highest numbered pagination
    # button; a page with no pagination has exactly one page.
    paginationBtns = commentSoup.find_all('a', class_='pagination__action')
    lastPaginationBtn = commentSoup.find('a', class_='pagination__action--last')
    if lastPaginationBtn is not None:
        totalPages = int(lastPaginationBtn['data-page'])
    elif paginationBtns:
        totalPages = int(paginationBtns[-1]['data-page'])
    else:
        totalPages = 1

    def getComments(pageUrl):
        """Fetch one discussion page and return its comments, oldest first."""
        soup = getHTML(pageUrl)
        print('Fetching {0}'.format(pageUrl))
        commentArray = []
        for comment in soup.select('li.d-comment'):
            commentObj = {}
            commentObj['id'] = comment['data-comment-id']
            commentObj['timestamp'] = comment['data-comment-timestamp']
            commentObj['author'] = comment['data-comment-author']
            commentObj['author-id'] = comment['data-comment-author-id']
            body = comment.find(class_='d-comment__body')
            # A blockquote inside the body is the quoted parent comment;
            # drop it so 'text' contains only this comment's own words.
            if body.blockquote is not None:
                body.blockquote.clear()
            commentObj['text'] = body.getText().strip()
            replyTo = comment.find(class_='d-comment__reply-to-author')
            if replyTo is not None:
                # The parent <a>'s href is '#comment-<id>'; keep just the id.
                commentObj['reply-to'] = replyTo.parent['href'].replace('#comment-', '')
            else:
                commentObj['reply-to'] = ''
            commentArray.append(commentObj)
        # The site lists newest first; reverse into chronological order.
        return commentArray[::-1]

    allComments = []
    for i in range(totalPages, 0, -1):
        params = urlencode({'page': i})
        # BUG FIX: the original used '{0}?={1}', producing 'url?=page=N';
        # a query string is joined with a bare '?'.
        pageUrl = '{0}?{1}'.format(commentUrl, params)
        allComments.extend(getComments(pageUrl))
    return allComments