
Commit 197ed7a

Merge commit (2 parents: 5db7f7c + 57b26ff)
Conflicts: beehive_scraper.py

13 files changed (+2518, -2 lines)

beehive_scraper.py (+10, -2)
@@ -1,9 +1,12 @@
 import json
 import bs4
 from bs4 import BeautifulSoup
-import requests
+
+import urllib.request
+import time
 
 def parse_single_release(s):
+    time.sleep(5)
     release = {}
     content = str(s.find_all('div', class_="content"))
     release['content'] = content.replace('<p>', '').replace('<p>', '').replace('</p>', '').replace('\n', '').replace('\\u', '')
@@ -20,7 +23,12 @@ def parse_single_release(s):
 soup = BeautifulSoup(data)
 paths = [s['href'] for s in soup.find_all('a') if '/release/' in s['href']]
 paths = [base+p for i, p in enumerate(paths) if i%2==0]
-releases = [BeautifulSoup(requests.get(p)) for p in paths]
+
+releases = [BeautifulSoup(urllib.request.urlopen(p)) for p in paths]
+if len(releases) == 0:
+    print(soup)
+    raise SystemExit
+
 parsed_releases = [parse_single_release(s) for s in releases]
 for rel in parsed_releases:
     print('writing {}'.format(rel['title']))
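
Taken together, the patch swaps requests for the standard-library urllib.request, adds a 5-second time.sleep per parsed release, and aborts (after printing the index page's soup for debugging) when no /release/ links are found. Below is a minimal sketch of that fetch pattern, not the repository's code: the beehive.govt.nz base URL and the /releases index path are assumptions for illustration, BeautifulSoup is given an explicit parser (the patched script relies on the default), and the delay is placed between fetches rather than inside the parse step, since in the patched code every page is downloaded before parse_single_release ever sleeps.

import time
import urllib.request

from bs4 import BeautifulSoup

BASE = "https://www.beehive.govt.nz"  # assumed base URL, for illustration only
DELAY_SECONDS = 5                     # mirrors the patch's time.sleep(5)

def fetch_soup(url):
    """Fetch a page with urllib.request and parse it with BeautifulSoup."""
    # urlopen returns a file-like response object, which BeautifulSoup accepts
    # directly; naming a parser avoids the "no parser specified" warning.
    with urllib.request.urlopen(url) as resp:
        return BeautifulSoup(resp, "html.parser")

def release_paths(index_soup, base=BASE):
    """Absolute URLs for every anchor whose href contains /release/."""
    hrefs = [a["href"] for a in index_soup.find_all("a", href=True)
             if "/release/" in a["href"]]
    return [base + h for h in hrefs]

def scrape(index_url):
    index_soup = fetch_soup(index_url)
    paths = release_paths(index_soup)
    if not paths:
        # Same guard as the patch: dump what was fetched, then stop.
        print(index_soup)
        raise SystemExit("no /release/ links found")
    releases = []
    for p in paths:
        time.sleep(DELAY_SECONDS)  # throttle between successive requests
        releases.append(fetch_soup(p))
    return releases

if __name__ == "__main__":
    for soup in scrape(BASE + "/releases"):  # assumed index path
        title = soup.find("h1")
        print("fetched:", title.get_text(strip=True) if title else "(untitled)")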
