
Commit 5db7f7c

script for downloading stamps
1 parent 67ebaa1 commit 5db7f7c

2 files changed (+40 -3 lines)


beehive_scraper.py (+3 -3)
@@ -1,7 +1,7 @@
 import json
 import bs4
 from bs4 import BeautifulSoup
-import urllib
+import requests

 def parse_single_release(s):
     release = {}
@@ -16,11 +16,11 @@ def parse_single_release(s):
 if __name__ == '__main__':
     for i in range(277):
         base = 'https://www.beehive.govt.nz'
-        data = urllib.request.urlopen(base + '/releases?page={}'.format(i))
+        data = requests.get(base + '/releases?page={}'.format(i))
         soup = BeautifulSoup(data)
         paths = [s['href'] for s in soup.find_all('a') if '/release/' in s['href']]
         paths = [base+p for i, p in enumerate(paths) if i%2==0]
-        releases = [BeautifulSoup(urllib.request.urlopen(p)) for p in paths]
+        releases = [BeautifulSoup(requests.get(p)) for p in paths]
         parsed_releases = [parse_single_release(s) for s in releases]
         for rel in parsed_releases:
             print('writing {}'.format(rel['title']))
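
One caveat in this change: requests.get() returns a Response object, and both BeautifulSoup calls above pass that object to the parser directly. BeautifulSoup expects markup as a string or a file-like handle (which urlopen() provided), so these calls will likely fail or mis-parse; the decoded HTML lives on the Response's .text attribute. A minimal sketch of the intended fetch-and-parse step, assuming the same requests and bs4 APIs as in the diff; fetch_soup is a hypothetical helper, not part of the commit:

import requests
from bs4 import BeautifulSoup

def fetch_soup(url):
    # Hypothetical helper: fetch a page and hand the decoded HTML text
    # (resp.text) to BeautifulSoup rather than the Response object itself.
    resp = requests.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')

base = 'https://www.beehive.govt.nz'
soup = fetch_soup(base + '/releases?page=0')
paths = [a['href'] for a in soup.find_all('a', href=True) if '/release/' in a['href']]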

download_stamps.py (+37)
@@ -0,0 +1,37 @@
+import requests
+import os
+import shutil
+import sys
+
+def copy_image(image_url, download_path):
+    r = requests.get(image_url, stream=True)
+    if r.status_code == 200:
+        with open(download_path, 'wb') as f:
+            r.raw.decode_content = True
+            shutil.copyfileobj(r.raw, f)
+
+def get_images(url):
+    if url is None:
+        return []
+    r = requests.get(url)
+    if r.status_code == 200:
+        return [result['large_thumbnail_url'] for result in r.json()["search"]["results"]]
+    return []
+
+api_key = "eJesEUUomq_zGoW9nBAW"
+url = "http://api.digitalnz.org/v3/records.json?api_key={api_key}&text=stamp&and[category][]=Images&per_page=200".format(api_key=api_key)
+page = sys.argv[1]
+
+url += "&page=%s" % page
+
+folder = "image_collection"
+if not os.path.exists(folder):
+    os.mkdir(folder)
+
+for index, thumb in enumerate(get_images(url)):
+    print "downloading:", thumb
+    try:
+        copy_image(thumb, os.path.join(folder, "%s_%s.jpg" % (page, index)))
+    except Exception as error:
+        print "Error downloading image %s:" % thumb, error
+        continue
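
The new script queries the DigitalNZ v3 records.json endpoint with an API key, text=stamp, an Images category filter, per_page=200, and a page number taken from sys.argv[1], then streams each large_thumbnail_url into image_collection/. Note the print statements are Python 2 syntax, so the script won't run under Python 3 as written. A minimal sketch of the same records query using requests' params argument and Python 3 prints (same endpoint and parameter names as above; "YOUR_API_KEY" is a placeholder, and requests will percent-encode the bracketed filter key, which the API is assumed to accept):

import requests

# Same DigitalNZ records search as download_stamps.py, with requests
# building and encoding the query string.
params = {
    "api_key": "YOUR_API_KEY",   # placeholder; substitute a real DigitalNZ key
    "text": "stamp",
    "and[category][]": "Images",
    "per_page": 200,
    "page": 1,
}
resp = requests.get("http://api.digitalnz.org/v3/records.json", params=params)
resp.raise_for_status()
thumbs = [r["large_thumbnail_url"] for r in resp.json()["search"]["results"]]
print("thumbnails on this page:", len(thumbs))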

Comments (0)