-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
82 lines (63 loc) · 2.21 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/python
# Usage:
# python scraper.py --videoid='<video_id>'
from apiclient.errors import HttpError
from oauth2client.tools import argparser
from apiclient.discovery import build
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
DEVELOPER_KEY = 'key'
def get_comment_threads(youtube, video_id, comments):
threads = []
results = youtube.commentThreads().list(
part="snippet",
videoId=video_id,
textFormat="plainText",
).execute()
#Get the first set of comments
for item in results["items"]:
threads.append(item)
comment = item["snippet"]["topLevelComment"]
text = comment["snippet"]["textDisplay"]
comments.append(text)
#Keep getting comments from the following pages
while ("nextPageToken" in results):
results = youtube.commentThreads().list(
part="snippet",
videoId=video_id,
pageToken=results["nextPageToken"],
textFormat="plainText",
).execute()
for item in results["items"]:
threads.append(item)
comment = item["snippet"]["topLevelComment"]
text = comment["snippet"]["textDisplay"]
comments.append(text)
print "Total threads: %d" % len(threads)
return threads
def get_comments(youtube, parent_id, comments):
results = youtube.comments().list(
part="snippet",
parentId=parent_id,
textFormat="plainText"
).execute()
for item in results["items"]:
text = item["snippet"]["textDisplay"]
comments.append(text)
return results["items"]
if __name__ == "__main__":
argparser.add_argument("--videoid", help="Required; ID for video for which the comment will be inserted.")
args = argparser.parse_args()
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)
try:
output_file = open("output.txt", "w")
comments = []
video_comment_threads = get_comment_threads(youtube, args.videoid, comments)
for thread in video_comment_threads:
get_comments(youtube, thread["id"], comments)
for comment in comments:
output_file.write(comment.encode("utf-8") + "\n")
output_file.close()
print "Total comments: %d" % len(comments)
except HttpError, e:
print "An HTTP error %d occurred:\n%s" % (e.resp.status, e.content)