-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathResidentAdvisor-selenium.py
103 lines (81 loc) · 3.25 KB
/
ResidentAdvisor-selenium.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from selenium import webdriver # powers the browser interaction
from selenium.webdriver.support.ui import Select # selects menu options
from bs4 import BeautifulSoup # to parse HTML
import csv # to write CSV
import pandas as pd # to see CSV
import time
import os
import random
# Scrape Resident Advisor single reviews into a CSV file, one row per review.
#
# The script walks the paginated review listing, visits every review linked
# from each listing page, extracts the metadata fields and appends them to
# CSV_PATH.

CSV_PATH = "resident-adv.csv"
START_URL = "https://www.residentadvisor.net/reviews.aspx?format=single"
# The listing "next" link and the review links are joined to slightly
# different prefixes in the original script; both are kept as they were.
LISTING_PREFIX = "https://www.residentadvisor.net/"
REVIEW_PREFIX = 'https://www.residentadvisor.net'
# Upper bound on the number of listing pages visited in one run.
MAX_PAGES = 10000
# Number of reviews skipped on the very first listing page.
# NOTE(review): presumably a leftover from resuming an interrupted run --
# confirm before relying on it for a fresh scrape.
FIRST_PAGE_SKIP = 25
CSV_HEADER = ["title",
              "artist",
              "single",
              "label",
              "record",
              "style",
              "reviewed_date",
              "release_date",
              "comments",
              "rating",
              "description",
              "URL"]


def split_title(title):
    """Split a review title of the form "artist - single" into its parts.

    Only the first hyphen is treated as the separator, so a single whose
    name itself contains a hyphen is kept intact. Returns ("", "") when the
    title contains no hyphen at all.
    """
    parts = title.split("-", 1)
    if len(parts) < 2:
        return "", ""
    return parts[0].strip(), parts[1].strip()


def find_next_page(soup):
    """Return the URL of the next listing page, or "" when there is none."""
    try:
        href = soup.find("li", {"class": "but arrow-left bbox"}).find("a")['href']
    except (AttributeError, TypeError, KeyError):
        # Missing <li>, missing <a> inside it, or an <a> without href.
        return ""
    return LISTING_PREFIX + href


def collect_review_links(soup):
    """Return the absolute URLs of all reviews linked from a listing page."""
    articles = soup.find("div", {"id": "reviews"}).find_all(
        "article", {"class": "highlight-top"})
    return [REVIEW_PREFIX + article.find("a")['href'] for article in articles]


def parse_review(soup, url):
    """Extract one CSV row (same column order as CSV_HEADER) from a review page.

    The title is printed as soon as it is known so that, if a later lookup
    fails on an unexpected page layout, the console shows which review was
    being processed.
    """
    title = soup.find("div", {"id": "sectionHead"}).find("h1").text.strip()
    artist, single = split_title(title)
    print(title)

    rating = soup.find("span", {"class": "rating"}).text.split("/")[0]
    reviewed_date = soup.find(
        "span", {"itemprop": "dtreviewed"})['datetime'].strip()

    # The metadata <li> items are read positionally: 0 = label / record,
    # 1 = release date, 2 = style, 3 = comments.
    meta_list = soup.find("ul", {"class": "clearfix"}).find_all("li")
    label_html = str(meta_list[0])
    label = label_html.split('<br/>')[0].split('">')[-1].split('</')[0].strip()
    record = label_html.split('<br/>')[-1].split("</")[0].strip()
    release_date = meta_list[1].text.split('\n')[4]
    style = meta_list[2].text.split('\n')[4]
    comments = meta_list[3].text.split('\n')[4].split("/")[0].strip()

    description = soup.find(
        "span", {"itemprop": "description"}).text.strip()

    return [title,
            artist,
            single,
            label,
            record,
            style,
            reviewed_date,
            release_date,
            comments,
            rating,
            description,
            url]


def ensure_header(path):
    """Write the CSV header row only if the file is missing or empty.

    The output file is opened in append mode so repeated runs accumulate
    rows; this keeps the header from being duplicated on every run.
    """
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return
    with open(path, "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow(CSV_HEADER)


def main():
    """Walk the review listing and append every review to CSV_PATH."""
    # NOTE(review): PhantomJS support was dropped from recent Selenium
    # releases; kept here because the rest of the file targets it.
    driver = webdriver.PhantomJS()
    try:
        ensure_header(CSV_PATH)
        # newline="" is what the csv module expects of its file object.
        with open(CSV_PATH, "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            next_page = START_URL
            for page_index in range(MAX_PAGES):
                driver.get(next_page)
                listing = BeautifulSoup(driver.page_source, "html5lib")
                next_page = find_next_page(listing)

                review_links = collect_review_links(listing)
                if page_index == 0:
                    review_links = review_links[FIRST_PAGE_SKIP:]

                for review_url in review_links:
                    driver.get(review_url)
                    review = BeautifulSoup(driver.page_source, 'html5lib')
                    writer.writerow(parse_review(review, review_url))
                    # Flush per row so an interrupted run keeps what it has.
                    f.flush()
                    # Random pause between requests.
                    time.sleep(random.randint(1, 3))

                if not next_page:
                    # Last listing page reached: stop instead of requesting
                    # an empty URL.
                    break
                time.sleep(random.randint(1, 3))
    finally:
        # Always shut the browser process down, even after a failure.
        driver.quit()


if __name__ == "__main__":
    main()