#NOTE: Must have Beautiful Soup installed (pip install beautifulsoup4).
#An intentional delay is added so as not to spam the server with page requests all at once.
#No crawl delay is specified in the U of A robots.txt, so the delay is set to 2 seconds.
#Please do not edit or remove it, as spamming the server may hypothetically lead to an IP block or
#future bot-blocking measures.
import csv
import time
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
UA_ROOT_URL = 'https://catalogue.ualberta.ca'
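#The catalogue pages link to each other with site-relative hrefs, so each scraped href is
#concatenated onto this root URL before being opened (see the urlopen calls below).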
#First crawl the course faculty page and gather the faculty link and the faculty name for each
#faculty.
try:
    courseFacultyPage = urlopen(UA_ROOT_URL+'/Course')
except Exception:
    raise Exception('Error. URL could not be opened. This probably means that the targeted website has moved or been shut down.')
time.sleep(2)
courseFacultySoup = bs(courseFacultyPage, 'html.parser')
facultyTable = courseFacultySoup.find('table', {'class':'pure-table pure-table-striped'})
faculties = facultyTable.find_all('td')
facultyLinks = []
for faculty in faculties:
    facultyATag = faculty.find('a')
    facultyLinks.append((facultyATag.contents[0], facultyATag.get('href')))
#print(facultyLinks)
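#facultyLinks now holds (facultyName, relative URL) tuples; a hypothetical entry might look
#like ('Faculty of Science', '/Course/...'), with the path joined onto UA_ROOT_URL later.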
try:
    with open('UAlbertaCoursesSingleTable.csv', 'w', newline='') as csvfile:
        #Create csv table header
        csvWriter = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        csvWriter.writerow(['facultyName','subjectName','courseLetters','courseNumbers','courseTitle','courseSummary'])
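        #QUOTE_ALL wraps every field in double quotes, so commas inside course
        #titles and summaries cannot break the CSV columns.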
        print('Webcrawler is now running. To ensure the crawler is not spamming the server, a hard-coded delay is in place. '
              'Please do not modify/remove the delay, as this will slow down the target server and can lead to repercussions. '
              'This should take several minutes; please be patient as it runs. The program will never freeze.')
        #Now use each faculty link stored in memory to traverse that faculty's page and gather the
        #course subjects' code name, long name, and link.
        for facultyName, facultyURL in facultyLinks:
            courseSubjectPage = urlopen(UA_ROOT_URL+facultyURL)
            time.sleep(2)
            courseSubjectSoup = bs(courseSubjectPage, 'html.parser')
            subjectTable = courseSubjectSoup.find('table', {'class':'pure-table pure-table-striped'})
            subjectRows = subjectTable.find_all('tr')
            #Clear out the previous faculty's subjects and dictionaries. The dictionaries remember
            #each subject link's long name and code.
            subjectLinks = []
            subjectLongNameDict = {}
            subjectCodeDict = {}
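            #Both dictionaries are keyed by the subject's relative URL, so the course loop below
            #can look up the names for whichever subject page it is on.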
            for subjectRow in subjectRows:
                subjectCols = subjectRow.find_all('td')
                #Make sure the row actually has content in it, since the web page can give blank rows 🙄
                if len(subjectCols) == 2:
                    #Get the subject code, name, and link of each subject on the faculty page.
                    #Links will be opened in order to find all courses in each subject.
                    subjectATag = subjectCols[0].find('a')
                    subjectLink = subjectATag.get('href')
                    subjectLongNameDict[subjectLink] = subjectCols[1].contents[0].strip()
                    subjectCodeDict[subjectLink] = subjectATag.contents[0]
                    subjectLinks.append(subjectLink)
            #Finally, take each subject URL to get all the courses within that given subject
            for subjectLink in subjectLinks:
                coursePage = urlopen(UA_ROOT_URL+subjectLink)
                time.sleep(2)
                courseSoup = bs(coursePage, 'html.parser')
                courseDivs = courseSoup.find_all('div', {'class':'claptrap-course'})
                for courseDiv in courseDivs:
                    #Get the course code and split it up to find the course number ONLY
                    courseCode = courseDiv.find('span', {'class':'claptrap-course-number'}).contents[0].strip()
                    courseNumber = ''.join(filter(str.isdigit, courseCode))
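                    #e.g. ''.join(filter(str.isdigit, 'CMPUT 174')) gives '174' ('CMPUT 174' is a hypothetical example code).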
                    courseTitle = courseDiv.find('span', {'class':'claptrap-course-title'}).contents[0].strip()
                    courseSummaryPTag = courseDiv.find('p')
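                    #When a description exists it appears to be the third node (contents[2]) of the
                    #course's <p> tag; that index is an assumption about the catalogue's markup.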
                    #If no description is available
                    if courseSummaryPTag is None:
                        courseSummary = 'No description available for this course.'
                    else:
                        courseSummary = courseSummaryPTag.contents[2].strip()
                    #Fill in the course info as a csv row.
                    csvWriter.writerow([facultyName, subjectLongNameDict[subjectLink], subjectCodeDict[subjectLink], courseNumber, courseTitle, courseSummary])
                    print(facultyName, subjectLongNameDict[subjectLink], subjectCodeDict[subjectLink], courseNumber, courseTitle)
                print('.')
    print('Done!')
except Exception as e:
    print(e)
    print('Error. An exception has occurred while scraping. '
          'It is very likely that the page layout has been '
          'altered since this script was made. A new script '
          'may be needed to scrape this page.')
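#Usage sketch (assuming Python 3 with beautifulsoup4 installed):
#    pip install beautifulsoup4
#    python uniScraperSingleTable.py
#The script writes UAlbertaCoursesSingleTable.csv to the current working directory.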