-
Notifications
You must be signed in to change notification settings - Fork 12
/
scrape.py
56 lines (48 loc) · 1.84 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import re
import os
import passwords
import sys
from subprocess import Popen
from mechanize import Browser
from BeautifulSoup import BeautifulSoup
from multiprocessing import Pool
def download(work):
    """Download one lecture video with mimms unless it already exists.

    work is a (url, filename) pair: work[0] is the stream URL handed to
    mimms and work[1] is the name of the finished output file.

    The stream is first written to "_" + filename and only renamed to
    filename after mimms exits with status 0, so the existence check at
    the top is only ever satisfied by a completed download.

    Failures (mimms missing, non-zero exit status, rename error) are
    reported on stdout and the function returns normally, so one bad
    lecture does not abort a whole Pool.map() run.  Always returns None.
    """
    url, final_name = work[0], work[1]
    if os.path.exists(final_name):
        print("Already downloaded %s" % (work,))
        return
    print("Starting %s" % (work,))

    # Put it in a temp file
    partial_name = "_" + final_name

    # Run mimms with an argument list instead of a shell command line: the
    # URL and file name are scraped from a web page, so they must never be
    # interpreted by a shell.  The -c flag is passed through unchanged.
    try:
        status = Popen(["mimms", "-c", url, partial_name]).wait()
    except OSError as err:
        # Raised when the mimms executable cannot be started at all.
        print("Failed %s: %s" % (work, err))
        return
    if status != 0:
        print("Failed %s: mimms exited with status %s" % (work, status))
        return

    # Move the file on success
    try:
        os.rename(partial_name, final_name)
    except OSError as err:
        print("Failed %s: %s" % (work, err))
        return
    print("Finished %s" % (work,))
if __name__ == '__main__':
    # Usage: scrape.py COURSE_LINK_TEXT
    # The single argument is the exact text of the course link to follow
    # after logging in.
    if len(sys.argv) < 2:
        sys.exit("usage: %s COURSE_LINK_TEXT" % sys.argv[0])
    course_title = sys.argv[1]

    # Pretend we're just a regular old user (this is naughty, don't try this at home kids)
    br = Browser()
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6; en-us) AppleWebKit/531.9 (KHTML, like Gecko) Version/4.0.3 Safari/531.9')]
    br.set_handle_robots(False)
    br.open("https://myvideosu.stanford.edu/oce/currentquarter.aspx")
    # Explicit check rather than assert, which is stripped under python -O.
    if not br.viewing_html():
        sys.exit("Expected an HTML page from the course listing URL")

    # Import from a module outside of version control your SUNET id and password
    br.select_form(name="login")
    br["username"] = passwords.my_username
    br["password"] = passwords.my_password

    # Log in, then open the course page for the title you're looking for
    br.submit()
    br.follow_link(text=course_title)

    # Build up a list of lectures: each "WMP" link carries the lecture page
    # address as a single-quoted string inside its URL.
    links = []
    for link in br.links(text="WMP"):
        quoted = re.search(r"'(.*)'", link.url)
        if quoted is None:
            sys.stderr.write("Skipping unrecognized WMP link: %s\n" % link.url)
            continue
        links.append(quoted.group(1))

    videos = []
    # These are done serially purely just to not look suspicious, we could probably parallelize this as well
    for link in links:
        response = br.open(link)
        soup = BeautifulSoup(response.read())
        player = soup.find('object', id='WMPlayer')
        if player is None:
            sys.stderr.write("Skipping %s: no WMPlayer object found\n" % link)
            continue
        # Swap the scheme for mms; only the first "http" is replaced so any
        # later occurrence inside the URL is left intact.
        video = re.sub("http", "mms", player['data'], count=1)
        # Output name is built from the "<course>/<number>" part of the URL,
        # e.g. "abc123/45" becomes "abc123_45.wmv".
        name_match = re.search(r"[a-z]+[0-9]+[a-z]?/[0-9]+", video)
        if name_match is None:
            sys.stderr.write("Skipping %s: cannot derive a file name\n" % video)
            continue
        output_name = name_match.group(0).replace("/", "_") + ".wmv"
        videos.append((video, output_name))

    # Make a process pool and download 5 files at a time
    p = Pool(processes=5)
    p.map(download, videos)
    # Shut the workers down cleanly once every download has been attempted.
    p.close()
    p.join()