#!/usr/bin/env python
"""
Download lecture resources, such as videos, for Coursera classes. Given a
class name and a related cookie file, this script scrapes the course listing
page for the section (week) and lecture names, then downloads the related
materials into appropriately named files and directories.

Examples:
  coursera-dl -c cookies.txt saas
  coursera-dl -c cookies.txt -l listing.html -o saas --skip-download

Author:
  John Lehmann (first last at geemail dotcom or @jplehmann)
"""

import sys, os, re, string
import urllib, urllib2, cookielib
import subprocess
import argparse
import StringIO
import tempfile
from BeautifulSoup import BeautifulSoup

def get_syllabus_url(className):
    """Return the Coursera index/syllabus URL."""
    return "http://class.coursera.org/%s/lecture/index" % className

def get_auth_url(className):
    """Return the login URL which, once followed, sets the auth cookies."""
    return ("http://class.coursera.org/%s/auth/auth_redirector"
            "?type=login&subtype=normal&email=&visiting=&minimal=true"
            % className)

def write_cookie_file(className, username, password):
    """Log in with the given credentials and save the session cookies to a
    temporary file, whose name is returned."""
    fd, fn = tempfile.mkstemp()
    os.close(fd)  # MozillaCookieJar wants a filename, not an open descriptor
    cj = cookielib.MozillaCookieJar(fn)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj),
                                  urllib2.HTTPHandler())
    req = urllib2.Request(get_auth_url(className))
    ref = opener.open(req).geturl()
    data = urllib.urlencode({'email': username, 'password': password,
                             'login': 'Login'})
    req = urllib2.Request(ref, data)
    opener.open(req)
    cj.save()
    return fn

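# Illustrative usage (credentials invented); the caller is responsible for
# deleting the temporary file, as main() does below:
#   cookie_file = write_cookie_file("saas", "student@example.com", "s3cret")
#   page = get_page(get_syllabus_url("saas"), cookie_file)
#   os.unlink(cookie_file)
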
def load_cookies_file(cookies_file):
    """Load the cookies file, pre-pending the special Netscape header line,
    because the cookie loader is very particular about that string."""
    cookies = StringIO.StringIO()
    NETSCAPE_HEADER = "# Netscape HTTP Cookie File"
    cookies.write(NETSCAPE_HEADER + "\n")
    cookies.write(open(cookies_file, 'r').read())
    cookies.flush()
    cookies.seek(0)
    return cookies

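# For reference, each data line in a Netscape-format cookies.txt holds seven
# tab-separated fields -- domain, subdomain flag, path, secure flag, expiry,
# name, value -- e.g. (values invented):
#   .coursera.org<TAB>TRUE<TAB>/<TAB>FALSE<TAB>1999999999<TAB>session<TAB>abc123
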
def get_opener(cookies_file):
    """Use the cookie file to create a URL opener."""
    cj = cookielib.MozillaCookieJar()
    cookies = load_cookies_file(cookies_file)
    # Nasty hack: cj.load() requires a filename, not a file object, and a
    # StringIO has no name on disk. NamedTemporaryFile was tried before but
    # caused problems on Windows, so call the private loader directly.
    cj._really_load(cookies, "StringIO.cookies", False, False)
    return urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

def get_page(url, cookies_file):
    """Download an HTML page using the cookie jar."""
    opener = get_opener(cookies_file)
    return opener.open(url).read()

def get_syllabus(class_name, cookies_file, local_page=False):
    """Get the course listing webpage, or a cached local copy of it."""
    if not (local_page and os.path.exists(local_page)):
        url = get_syllabus_url(class_name)
        page = get_page(url, cookies_file)
        print "Downloaded %s (%d bytes)" % (url, len(page))
        # cache the page if we're in 'local' mode
        if local_page:
            open(local_page, 'w').write(page)
    else:
        page = open(local_page).read()
    return page

def clean_filename(s):
    """Sanitize a string to be used as a filename."""
    # strip a trailing parenthesized portion, which holds the time length
    s = re.sub(r"\([^\(]*$", "", s)
    s = s.strip().replace(':', '-').replace(' ', '_')
    valid_chars = "-_.()%s%s" % (string.ascii_letters, string.digits)
    return ''.join(c for c in s if c in valid_chars)

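# Illustrative behavior (input invented):
#   clean_filename("Lecture 1: Overview (12:34)")  ->  "Lecture_1-_Overview"
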
def get_anchor_format(a):
    """Extract the resource file-type format from an anchor href."""
    # (. or format=) then (file_extension) then (? or end of string),
    # e.g. "...format=txt" or "...download.mp4?..."
    fmt = re.search(r"(?:\.|format=)(\w+)(?:\?.*)?$", a)
    return fmt.group(1) if fmt else None

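# Illustrative examples (URLs invented):
#   get_anchor_format("http://host/download.mp4?lecture_id=42")  ->  "mp4"
#   get_anchor_format("http://host/lecture/view?format=srt")     ->  "srt"
#   get_anchor_format("http://host/lecture/view")                ->  None
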
def parse_syllabus(page):
    """Parse a Coursera course listing/syllabus page.
    Each section is a week of classes."""
    sections = []
    soup = BeautifulSoup(page)
    # traverse sections
    for stag in soup.findAll(attrs={'class': 'list_header'}):
        assert stag.string is not None, "couldn't find section"
        section_name = clean_filename(stag.string)
        print section_name
        lectures = []  # resources for 1 lecture
        # traverse resources (e.g., video, ppt, ..)
        for vtag in stag.parent.nextSibling.findAll('li'):
            assert vtag.a.contents[0], "couldn't get lecture name"
            vname = clean_filename(vtag.a.contents[0])
            print " ", vname
            lecture = {}
            for a in vtag.findAll('a'):
                href = a['href']
                fmt = get_anchor_format(href)
                print " ", fmt, href
                if fmt:
                    lecture[fmt] = href
            lectures.append((vname, lecture))
        sections.append((section_name, lectures))
    print "Found %d sections and %d lectures on this page" % \
        (len(sections), sum(len(s[1]) for s in sections))
    if not sections:
        print "Probably bad cookies file (or wrong class name)"
    return sections

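# The returned structure is a list of (section_name, lectures) pairs, where
# each lecture maps a format to its URL -- roughly (values invented):
#   [("Week_1_Introduction",
#     [("Welcome", {"mp4": "http://...", "pdf": "http://..."})])]
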
def download_lectures(
        wget_bin,
        cookies_file,
        class_name,
        sections,
        file_formats,
        overwrite=False,
        skip_download=False,
        section_filter=None,
        lecture_filter=None):
    """Download lecture resources described by sections."""

    def format_section(num, section):
        return "%s_%02d_%s" % (class_name.upper(), num, section)

    def format_resource(num, name, fmt):
        return "%02d_%s.%s" % (num, name, fmt)

    for (secnum, (section, lectures)) in enumerate(sections):
        if section_filter and not re.search(section_filter, section):
            continue
        sec = format_section(secnum + 1, section)
        for (lecnum, (lecname, lecture)) in enumerate(lectures):
            if lecture_filter and not re.search(lecture_filter, lecname):
                continue
            if not os.path.exists(sec):
                os.mkdir(sec)
            # write lecture resources
            for fmt, url in [i for i in lecture.items()
                             if i[0] in file_formats or "all" in file_formats]:
                lecfn = os.path.join(sec, format_resource(lecnum + 1, lecname, fmt))
                print lecfn
                if overwrite or not os.path.exists(lecfn):
                    if not skip_download:
                        download_file(url, lecfn, cookies_file, wget_bin)
                    else:
                        open(lecfn, 'w').close()  # touch

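# Resulting on-disk layout (illustrative, for class "saas"):
#   SAAS_01_Week_1_Introduction/01_Welcome.mp4
#   SAAS_01_Week_1_Introduction/01_Welcome.pdf
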
def download_file(url, fn, cookies_file, wget_bin):
    """Download a file; remove the partial file if aborted by the user."""
    try:
        if wget_bin:
            download_file_wget(wget_bin, url, fn, cookies_file)
        else:
            download_file_nowget(url, fn, cookies_file)
    except KeyboardInterrupt:
        print "\nKeyboard Interrupt -- Removing partial file:", fn
        os.remove(fn)
        sys.exit()

def download_file_wget(wget_bin, url, fn, cookies_file):
    """Download a file using wget. Python could possibly stream files to
    disk, but wget is robust and gives nice visual feedback."""
    cmd = [wget_bin, url, "-O", fn, "--load-cookies", cookies_file,
           "--no-check-certificate"]
    print "Executing wget:", cmd
    subprocess.call(cmd)

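# Roughly the equivalent shell invocation (paths illustrative):
#   wget <url> -O SAAS_01_Week_1_Introduction/01_Welcome.mp4 \
#        --load-cookies cookies.txt --no-check-certificate
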
def download_file_nowget(url, fn, cookies_file):
    """'Native' python downloader -- slower than wget."""
    print "Downloading %s -> %s" % (url, fn)
    urlfile = get_opener(cookies_file).open(url)
    chunk_sz = 1048576  # read 1 MiB at a time
    bytesread = 0
    f = open(fn, "wb")
    while True:
        data = urlfile.read(chunk_sz)
        if not data:
            print "."
            break
        f.write(data)
        bytesread += len(data)
        print "\r%d bytes read" % bytesread,
        sys.stdout.flush()
    f.close()
    urlfile.close()

def parseArgs():
    parser = argparse.ArgumentParser(
        description='Download Coursera.org lecture material and resources.')
    # positional
    parser.add_argument('class_name', action='store',
                        help='name of the class (e.g. "nlp")')
    # required: either a cookies file or a username (with -p for the password)
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-c', '--cookies_file', dest='cookies_file',
                       action='store', default=None,
                       help='full path to the cookies.txt file')
    group.add_argument('-u', '--username', dest='username',
                       action='store', default=None, help='coursera username')
    parser.add_argument('-p', '--password', dest='password',
                        action='store', default=None, help='coursera password')
    # optional
    parser.add_argument('-f', '--formats', dest='file_formats',
                        action='store', default="all",
                        help='space-separated file format extensions to download, '
                             'in quotes, e.g. "mp4 pdf" (default: special value "all")')
    parser.add_argument('-sf', '--section_filter', dest='section_filter',
                        action='store', default=None,
                        help='only download sections which match this regex (default: disabled)')
    parser.add_argument('-lf', '--lecture_filter', dest='lecture_filter',
                        action='store', default=None,
                        help='only download lectures which match this regex (default: disabled)')
    parser.add_argument('-w', '--wget_bin', dest='wget_bin',
                        action='store', default=None,
                        help='wget binary, if wget should be used for downloading')
    parser.add_argument('-o', '--overwrite', dest='overwrite',
                        action='store_true', default=False,
                        help='whether existing files should be overwritten (default: False)')
    parser.add_argument('-l', '--process_local_page', dest='local_page',
                        help='for debugging: use or create a local cached version of the syllabus page')
    parser.add_argument('--skip-download', dest='skip_download',
                        action='store_true', default=False,
                        help='for debugging: skip the actual downloading of files')
    args = parser.parse_args()
    # turn the space-separated string of formats into a list
    args.file_formats = args.file_formats.split()
    # check arguments
    if args.cookies_file and not os.path.exists(args.cookies_file):
        print >> sys.stderr, "Cookies file not found: " + args.cookies_file
        sys.exit(1)
    if args.username and not args.password:
        print >> sys.stderr, "Password required when username is specified"
        sys.exit(1)
    return args

def main():
    args = parseArgs()
    tmp_cookie_file = None
    if args.username:
        tmp_cookie_file = write_cookie_file(args.class_name, args.username,
                                            args.password)
    page = get_syllabus(args.class_name,
                        args.cookies_file or tmp_cookie_file,
                        args.local_page)
    sections = parse_syllabus(page)
    download_lectures(
        args.wget_bin,
        args.cookies_file or tmp_cookie_file,
        args.class_name,
        sections,
        args.file_formats,
        args.overwrite,
        args.skip_download,
        args.section_filter,
        args.lecture_filter)
    if not args.cookies_file:
        os.unlink(tmp_cookie_file)


if __name__ == "__main__":
    main()