-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
34 lines (29 loc) · 1.13 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import requests
from bs4 import BeautifulSoup
from datetime import datetime
# strptime format used to parse the `datetime` attribute of scraped <time>
# tags, e.g. "2019-05-04T20:00:00".
TIME_PRESET = "%Y-%m-%dT%H:%M:%S"
def scrape_website(website_url, timeout=30):
    """Scrape event listings from a calendar-style page.

    Fetches ``website_url`` and builds one event from every
    ``<td class="has-events">`` cell that contains both a ``<time>`` tag
    carrying a ``datetime`` attribute and an element with class
    ``longdesc``. Cells missing any of these pieces are skipped.

    Args:
        website_url: URL of the page to fetch.
        timeout: Seconds to wait for the server, passed straight to
            ``requests.get``. Pass ``None`` to wait indefinitely (the
            behaviour before this parameter existed).

    Returns:
        A list of dicts in page order, each with the keys ``"artist"``
        (the ``longdesc`` text with newlines removed) and ``"datetime"``
        (the parsed timestamp rendered by ``str``, i.e.
        ``YYYY-MM-DD HH:MM:SS``).

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
        ValueError: If a ``datetime`` attribute does not match
            ``TIME_PRESET``.
    """
    # Without a timeout an unresponsive server would block forever.
    page = requests.get(website_url, timeout=timeout)
    soup = BeautifulSoup(page.text, "html.parser")

    events = []
    for cell in soup.find_all("td", attrs={"class": "has-events"}):
        # A Tag can be searched directly; there is no need to serialize the
        # cell and re-parse it into a second BeautifulSoup document.
        time_tag = cell.time
        description = cell.find(attrs={"class": "longdesc"})
        if time_tag is None or description is None:
            continue

        event_date = time_tag.get("datetime")
        if event_date is None:
            continue

        event_datetime = datetime.strptime(event_date, TIME_PRESET)
        events.append({
            "artist": description.text.replace("\n", ""),
            "datetime": str(event_datetime),
        })
    return events