-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRioGrandeGames-selenium.py
77 lines (60 loc) · 2.59 KB
/
RioGrandeGames-selenium.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from selenium import webdriver # powers the browser interaction
from selenium.webdriver.support.ui import Select # selects menu options
from bs4 import BeautifulSoup # to parse HTML
import csv # to write CSV
import pandas as pd # to see CSV
import time
import os
import random
import requests
import pickle
# Scrape the Rio Grande Games search page, collect per-game metadata, then
# visit every game page to record its categories and download the rules PDF.
#
# Outputs:
#   pdfs/<title>.pdf  - one rules PDF per game that exposes a "rules" asset
#   game_dicts.pkl    - pickled list of the game dicts that got a PDF,
#                       rewritten after every successful download
#
# NOTE: webdriver.PhantomJS and the find_element(s)_by_css_selector helpers are
# Selenium 3 APIs; they were removed in Selenium 4.


def _is_rules_link(link):
    """Return True if any <img> inside *link* has a title containing 'rules'."""
    for img in link.find_elements_by_css_selector('img'):
        # get_attribute returns None when the attribute is absent.
        title = img.get_attribute('title') or ''
        if 'rules' in title.lower():
            return True
    return False


driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'])
try:
    # Search results page: the query string lists the category ids to include
    # and leaves language, players, length, age and search term unset (0 / '').
    driver.get('http://riograndegames.com/search.html?category%5B%5D=5&category%5B%5D=10&category%5B%5D=14&category%5B%5D=1&category%5B%5D=2&category%5B%5D=12&category%5B%5D=3&category%5B%5D=6&category%5B%5D=8&category%5B%5D=9&category%5B%5D=4&category%5B%5D=13&category%5B%5D=22&category%5B%5D=16&category%5B%5D=11&category%5B%5D=7&category%5B%5D=17&category%5B%5D=18&category%5B%5D=15&language=0&min_players=0&length=0&min_age=0&term=')
    search_results = driver.find_element_by_css_selector(
        'div#search_results.isotope').find_elements_by_css_selector(
        'div.search_item.isotope-item')

    # data-* attributes copied verbatim from each search result element.
    attributes = [
        'data-title',
        'data-orig',
        'data-length',
        'data-date',
        'data-age',
        'data-players',
        'data-msrp',
    ]

    # First pass: copy everything needed into plain dicts, so the second pass
    # is free to navigate the driver away from the results page.
    games_dicts = []
    for result in search_results:
        game = {attr: result.get_attribute(attr) for attr in attributes}
        game['page_link'] = result.find_element_by_css_selector(
            'a').get_attribute('href')
        games_dicts.append(game)

    # The PDFs are written below this directory; create it up front.
    os.makedirs('pdfs', exist_ok=True)

    # Second pass: visit each game page.
    final_games_dicts = []
    for g in games_dicts:
        print(g['data-title'])
        driver.get(g['page_link'])

        # Categories are stored ';'-joined with commas stripped from each name.
        cats = [c.text.replace(',', '')
                for c in driver.find_elements_by_css_selector('span.game_cat')]
        g['game_category'] = ';'.join(cats)

        # unfold and download
        driver.find_element_by_css_selector('span.button2').click()
        asset_links = driver.find_elements_by_css_selector('p.asset_list a')

        # NOTE(review): the source this was reconstructed from had lost its
        # indentation, so the nesting of its two trailing `break`s is
        # ambiguous. This version stops at the first asset link whose image
        # title contains "rules" - confirm that is the intended behaviour.
        download = None
        for link in asset_links:
            if _is_rules_link(link):
                download = link.get_attribute('href')
                break

        if download:
            # Reuse the browser's cookies so the download is made with the
            # same session state as the page visit.
            with requests.Session() as session:
                for cookie in driver.get_cookies():
                    session.cookies.set(cookie['name'], cookie['value'])
                response = session.get(download)

                if response.ok:
                    # A '/' in the title would otherwise be read as a
                    # directory separator.
                    safe_title = g['data-title'].replace('/', '_')
                    dl_path = 'pdfs/' + safe_title + '.pdf'
                    with open(dl_path, 'wb') as f:
                        f.write(response.content)
                    g['pdf_path'] = dl_path
                    final_games_dicts.append(g)

                    # Checkpoint after every download so progress survives a
                    # crash; `with` guarantees the pickle file is closed.
                    with open('game_dicts.pkl', 'wb') as pkl_file:
                        pickle.dump(final_games_dicts, pkl_file)
                else:
                    # Do not save an error page as a PDF.
                    print('  rules download failed: HTTP {}'.format(
                        response.status_code))
            time.sleep(1)

        # Pause between game pages to avoid hammering the site.
        time.sleep(1)
finally:
    # Always shut the browser process down, even if scraping fails midway.
    driver.quit()