-
Notifications
You must be signed in to change notification settings - Fork 3
/
vimeo_harvesters.py
157 lines (148 loc) · 3.98 KB
/
vimeo_harvesters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import io
import json
import csv
import os
import time
import requests
import youtube_dl
import dateparser
import sys
from datetime import datetime as dt
from datetime import timezone
# Put a machine-specific folder at the front of the module search path.
# NOTE(review): nothing visible in this file imports from it - presumably it
# holds credential modules used elsewhere; confirm it is still required.
sys.path.insert(0, r'C:\Source\secrets_and_credentials')
# Prefix of the agent name that get_channel() / get_video() stamp on each item.
agent_name = "vimeo_harvesters_1"
def get_channel(item):
    """
    Collect the videos of a Vimeo channel that are newer than the archive start date.

    The channel listing is fetched with youtube_dl (metadata only, nothing is
    downloaded at this stage), the ids of the matching videos are gathered and
    video_collector() is then run from inside item.storage_folder. The working
    directory that was current on entry is always restored before returning.

    Arguments:
        item (obj) - contains row data from spreadsheet; id, url,
            storage_folder and archived_start_date are read, agent_name and
            completed are written
    Returns:
        item (obj) - the same object with "completed" set to the flag returned
            by video_collector(), or False if the listing could not be
            extracted or anything raised along the way
    """
    print(item.id)
    print("Vimeo channel")
    video_ids = []
    url = item.url
    item.agent_name = agent_name + "_get_channel"
    storage_folder = item.storage_folder
    # Captured before the try block so the cleanup in `finally` can never hit
    # an unbound name.
    cwd = os.getcwd()
    try:
        os.makedirs(storage_folder, exist_ok=True)
        # video_collector() writes relative to the current directory.
        os.chdir(storage_folder)
        if url.endswith("/"):
            url = url[:-1]
        print("herr2")
        ydl_opts = {
            'ignoreerrors': True,
            'quiet': True
        }
        ydl = youtube_dl.YoutubeDL(ydl_opts)
        result = ydl.extract_info(url, download=False)
        if result is None:
            # With 'ignoreerrors' set, a failed extraction is reported as None
            # instead of an exception.
            print("no information extracted for " + url)
            item.completed = False
            return item
        if 'entries' in result:
            videos = result['entries']
            print(len(videos))
            for video in videos:
                print(video)
                if video is None:
                    # Entry that youtube_dl could not extract; skip it rather
                    # than failing the whole channel.
                    continue
                vidid = video["id"]
                print(vidid)
                if vidid not in video_ids:
                    upload_date = video["upload_date"]
                    print(upload_date)
                    # No start date means "collect everything"; otherwise only
                    # videos uploaded after it are kept.
                    if not item.archived_start_date or upload_date > item.archived_start_date:
                        video_ids.append(vidid)
        print("go to video collector")
        item.completed = video_collector(video_ids, storage_folder, item.id)
    except Exception as e:
        # Best effort: report the problem and mark the row as not completed.
        print(str(e))
        item.completed = False
    finally:
        os.chdir(cwd)
    return item
def get_video(item):
    """
    Manage the collection of a single Vimeo video.

    The video id is taken from the last path segment of item.url and handed to
    video_collector(), which is run from inside item.storage_folder. The
    working directory that was current on entry is always restored before
    returning.

    Arguments:
        item (obj) - contains row data from spreadsheet; id, url and
            storage_folder are read, agent_name and completed are written
    Returns:
        item (obj) - the same object with "completed" set to the flag returned
            by video_collector(), or False if anything raised along the way
    """
    print("Vimeo video")
    print(item.id)
    url = item.url
    item.agent_name = agent_name + "_get_video"
    storage_folder = item.storage_folder
    # Captured before the try block so the cleanup in `finally` can never hit
    # an unbound name.
    cwd = os.getcwd()
    try:
        os.makedirs(storage_folder, exist_ok=True)
        # video_collector() writes relative to the current directory.
        os.chdir(storage_folder)
        if url.endswith("/"):
            url = url[:-1]
        video_ids = [url.split('/')[-1]]
        # video_collector() swallows download errors and reports them through
        # its return value, so that value decides whether the row is complete.
        item.completed = video_collector(video_ids, storage_folder, item.id)
    except Exception as e:
        # Best effort: report the problem and mark the row as not completed.
        print(str(e))
        item.completed = False
    finally:
        os.chdir(cwd)
    return item
def video_collector(video_ids, storage_folder ,id):
    """
    Download videos with their metadata and log the outcome of each one.

    For every video id the video is downloaded with youtube_dl into
    ./<vidid>/ and the extracted information is dumped to
    ./<vidid>/<vidid>.json. A failure for one video is printed, appended to
    ./<id>/errors_<YYYYMMDD>.txt and does not stop the remaining videos.
    One row per video (id, vidid, timestamp, "True"/"False") is appended to
    ./<id>.csv.

    Arguments:
        video_ids(list) - list of video_ids to collect
        storage_folder(str) - ignored: the value is overridden with ".", so
            all paths are relative to the current working directory
        id(str) - unique identifier, used for the CSV and error-log names
    Returns:
        flag (bool) - true if everything collected, false if any error
    """
    print(video_ids)
    # The passed-in value is not used; both callers in this file chdir into
    # the storage folder before calling.
    storage_folder = "."
    flag = True
    csv_rows = []
    for vidid in video_ids:
        videos = []
        csv_row = [id, vidid, dt.now().strftime('%Y%m%d %H:%M:%S')]
        print(csv_row)
        url = "https://vimeo.com/" + vidid
        video_folder = os.path.join(storage_folder, vidid)
        try:
            # Constructed inside the try so that a failure here is logged like
            # any other per-video error instead of aborting the batch.
            ydl = youtube_dl.YoutubeDL({'outtmpl': os.path.join(video_folder, '%(id)s.%(ext)s')})
            result = ydl.extract_info(url, download=True)
            print("checking entries")
            if 'entries' in result:
                print("here entries")
                videos = result['entries']
            else:
                videos = result
            print("let us write to file")
            os.makedirs(video_folder, exist_ok=True)
            with open(os.path.join(video_folder, vidid + '.json'), 'w') as json_file:
                json.dump(videos, json_file)
            print("done")
        except Exception as e:
            print(str(e))
            flag = False
            # The error folder may not exist yet; without this the handler
            # itself would raise and abort the remaining videos.
            error_folder = os.path.join(storage_folder, id)
            os.makedirs(error_folder, exist_ok=True)
            error_name = 'errors_{}.txt'.format(dt.now().strftime('%Y%m%d'))
            # errors="replace" keeps an unencodable message from raising here.
            with open(os.path.join(error_folder, error_name), "a", errors="replace") as f:
                f.write(id + " " + str(e))
                f.write("\n")
        # Last column: whether any video information was extracted.
        csv_row.append('True' if videos else "False")
        csv_rows.append(csv_row)
    print(csv_rows)
    print(os.path.join(storage_folder, id + '.csv'))
    # newline='' is what the csv module requires; otherwise every row is
    # followed by a blank line on Windows.
    with open(os.path.join(storage_folder, id + '.csv'), 'a', newline='') as f:
        csv_writer = csv.writer(f, quoting=csv.QUOTE_NONE)
        csv_writer.writerows(csv_rows)
    return flag
def main():
    """Placeholder entry point: running the module as a script does nothing yet."""
    return None


if __name__ == '__main__':
    main()