-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBGG-BS.py
65 lines (52 loc) · 1.8 KB
/
BGG-BS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import pandas as pd
import urllib.parse
from bs4 import BeautifulSoup
import requests
import json
import time
# Look up every title from game_data.csv on BoardGameGeek (BGG), collect the
# search-result summary plus the item page's preloaded stats, checkpoint the
# raw records to all_dicts.json after each game, and finally write the input
# table joined with the BGG data to game_data_with_bgg.csv.

SEARCH_URL = ('https://boardgamegeek.com/geeksearch.php'
              '?action=search&objecttype=boardgame&q={}&B1=Go')
SITE_ROOT = 'https://boardgamegeek.com'
# Start of the script line on an item page that carries the JSON payload.
PRELOAD_MARKER = 'GEEK.geekitemPreload'
# Text removed from input titles before they are used as search queries.
RULES_SUFFIX = ' Rules'
# Keys given to the search-result cells that are kept, in cell order.
SUMMARY_FIELDS = ('rank', 'name', 'geek_rating', 'avg_rating', 'voters')
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request never returns
REQUEST_PAUSE = 1  # seconds slept after each game


def build_title_lookup(titles):
    """Map each search name to the original title it was derived from.

    The search name is the title with every ' Rules' occurrence removed.
    Recording the mapping while stripping avoids guessing the original title
    back by prefix, which is ambiguous ("Catan" is also a prefix of
    "Catan Junior Rules") and finds nothing when ' Rules' sits mid-title.
    When several titles reduce to the same name, the first one wins.
    """
    lookup = {}
    for title in titles:
        lookup.setdefault(title.replace(RULES_SUFFIX, ''), title)
    return lookup


def clean_cell(text):
    """Flatten the text of one table cell onto a single line.

    Newlines become spaces, tabs are dropped, and runs of spaces are
    collapsed to a single space.
    """
    flattened = text.strip().replace('\n', ' ').replace('\t', '')
    return ' '.join(piece for piece in flattened.split(' ') if piece)


def summarize_cells(cells):
    """Name the cells of a search-result row that are kept.

    Keeps the first cell and every cell from the third up to (not including)
    the last one, and labels them with SUMMARY_FIELDS.

    Raises:
        ValueError: if the row does not yield exactly one value per field.
    """
    values = [cells[0]] + cells[2:-1]
    if len(values) != len(SUMMARY_FIELDS):
        raise ValueError('expected {} summary cells, got {}'.format(
            len(SUMMARY_FIELDS), len(values)))
    return dict(zip(SUMMARY_FIELDS, values))


def extract_preload_stats(page_text):
    """Return the ['item']['stats'] mapping embedded in an item page.

    Every line starting with PRELOAD_MARKER is expected to hold a JSON object
    after the marker. The object is decoded from its opening brace, so the
    separator before it and anything trailing it (such as ';') do not matter.
    Returns an empty dict when no such line exists.

    Raises:
        ValueError: if the payload is not valid JSON.
        KeyError: if the payload lacks ['item']['stats'].
    """
    decoder = json.JSONDecoder()
    stats = {}
    for raw_line in page_text.split('\n'):
        line = raw_line.strip()
        if not line.startswith(PRELOAD_MARKER):
            continue
        start = line.find('{')
        if start == -1:
            continue
        data, _ = decoder.raw_decode(line[start:])
        stats.update(data['item']['stats'])
    return stats


def fetch_game(name):
    """Collect whatever BGG data can be found for one search name.

    Always returns a dict holding at least {'Title': name}. The lookup is
    best-effort: on any failure the fields gathered so far are kept and the
    problem is printed instead of aborting the whole run.
    """
    game = {'Title': name}
    search_url = SEARCH_URL.format(urllib.parse.quote_plus(name))
    print(search_url)
    try:
        search_html = requests.get(search_url, timeout=REQUEST_TIMEOUT).text
        first_result = BeautifulSoup(search_html, 'html5lib').find(
            'tr', {'id': 'row_'})
        if first_result is None:
            print('No BGG search result for {!r}'.format(name))
            return game
        cells = first_result.find_all('td')
        game.update(summarize_cells([clean_cell(cell.text) for cell in cells]))
        # The third cell links to the game's own page.
        item_url = SITE_ROOT + cells[2].find('a')['href']
        item_html = requests.get(item_url, timeout=REQUEST_TIMEOUT).text
        game.update(extract_preload_stats(item_html))
    except Exception as err:
        # Per-game boundary: one bad page must not end a long scrape.
        # KeyboardInterrupt and SystemExit are deliberately not caught here.
        print('Incomplete BGG data for {!r}: {!r}'.format(name, err))
    return game


def main():
    """Run the scrape and write all_dicts.json and game_data_with_bgg.csv."""
    df = pd.read_csv('game_data.csv')
    title_by_name = build_title_lookup(df['Title'])
    print(len(title_by_name))

    all_dicts = []
    for name in title_by_name:
        all_dicts.append(fetch_game(name))
        # Rewrite the checkpoint after every game so an interrupted run
        # keeps everything collected so far.
        with open('all_dicts.json', 'w') as checkpoint:
            json.dump(all_dicts, checkpoint)
        time.sleep(REQUEST_PAUSE)

    df2 = pd.DataFrame(all_dicts)
    # Restore the original titles so the records join back onto the input.
    df2['Title'] = df2['Title'].map(title_by_name)
    df.merge(df2, on='Title').to_csv('game_data_with_bgg.csv', index=False)


if __name__ == '__main__':
    main()