# scraper_hol.py
# Scrapes House of Lords members' expenses spreadsheets from parliament.uk.

from bs4 import BeautifulSoup
# third-party library for parsing and navigating HTML
import urllib.request
# standard-library module for making HTTP requests
import shutil
# standard-library module for high-level file operations

base_link = "http://www.parliament.uk"
expenses_base = base_link + "/mps-lords-and-offices/members-allowances/house-of-lords/holallowances/hol-expenses04/"

expense_years = [
    "201415",
    "201314",
    "201213",
    "201112",
    "201011",
    "200102-to-200809",
]

# Downloads the spreadsheet behind one document link (an Excel file per month)
# and saves it in the current directory under the file name taken from the link.
def download_sheet(doc_url):
    absolute_url = base_link + doc_url
    file_name = absolute_url.rsplit('/', 1)[-1]
    # .rsplit is a string method: it splits on '/' from the right, and the
    # second argument (1) limits it to a single split, giving a two-item list.
    # [-1] takes the last item of that list, which is our file name.
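    # A quick sketch of what .rsplit does here (hypothetical path, not one of
    # the real links):
    #   >>> "docs/2014/april.xlsx".rsplit('/', 1)
    #   ['docs/2014', 'april.xlsx']
    #   >>> "docs/2014/april.xlsx".rsplit('/', 1)[-1]
    #   'april.xlsx'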
    with urllib.request.urlopen(absolute_url) as response, open(file_name, 'wb') as out_file:
        # the with ... as ... as form is built-in Python: it opens both objects
        # and closes them automatically when the block ends.
        # urllib.request.urlopen(absolute_url) fetches the contents of the Excel file;
        # open() is the standard Python function that opens the given file, creating
        # it if it does not exist; 'wb' opens it for writing in binary mode.
        shutil.copyfileobj(response, out_file)
        # shutil.copyfileobj copies the downloaded bytes from the response object
        # (which exists only in memory) into out_file, which is written to disk.
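
# A hedged usage sketch (this path is made up for illustration; real paths
# come from get_doc_links below):
#   download_sheet("/documents/lords/expenses-april-2014.xlsx")
# would fetch http://www.parliament.uk/documents/lords/expenses-april-2014.xlsx
# and write it to ./expenses-april-2014.xlsx in the working directory.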

# Fetches the HTML at the given URL and parses it into a BeautifulSoup object.
def getHTML(url):
    html_string = urllib.request.urlopen(url).read()
    # urllib.request.urlopen opens the URL (it can fetch any file type: HTML,
    # xlsx, images, ...) and .read() returns the whole response body as bytes.
    return BeautifulSoup(html_string, 'html.parser')
    # BeautifulSoup parses the raw markup into a navigable tree of tags, which
    # is much easier to search than a plain string.
    # 'html.parser' names Python's built-in HTML parser; it is passed alongside
    # the markup so BeautifulSoup knows how to parse it.
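
# A minimal illustration of the object this returns (toy HTML, not the real page):
#   >>> soup = BeautifulSoup('<a class="document" href="/a.xlsx">April</a>', 'html.parser')
#   >>> soup.find_all('a', class_="document")[0]['href']
#   '/a.xlsx'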

# Parses the page for one year (see the year list above), finds every anchor
# tag with class "document", pulls each month's link out of its href attribute,
# keeps only the links that end in "xlsx", and downloads each one.
def get_doc_links(url):
    html = getHTML(url)
    document_links = html.find_all('a', class_="document")
    # .find_all is a BeautifulSoup method that returns all matching tags,
    # in this case every <a class="document">...</a>.
    # The result is a list of BeautifulSoup tag objects.
    for link_tag in document_links:
        link = link_tag['href']
        # href is the attribute of the <a> tag that holds the link target
        if link.endswith("xlsx"):
            # skip anything that is not an Excel spreadsheet
            download_sheet(link)
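
# For reference, the anchor tags this matches are assumed to look roughly like
# (shape inferred from the code above, not verified against the live page):
#   <a class="document" href="/documents/.../hol-expenses-april-2014.xlsx">April 2014</a>
# Links whose href ends in anything else (e.g. a PDF) are skipped.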

# Builds the URL for each year's expenses page and runs get_doc_links on it.
for year in expense_years:
    url = expenses_base + year
    get_doc_links(url)
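
# For example, the first iteration builds
#   http://www.parliament.uk/mps-lords-and-offices/members-allowances/house-of-lords/holallowances/hol-expenses04/201415
# and downloads every .xlsx spreadsheet linked from that page into the
# current working directory.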