-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreddit_map_downloader.py
105 lines (96 loc) · 2.61 KB
/
reddit_map_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import requests
import json
import urllib
import urllib2
import csv
from mods import ImgrCheck, DrawResults, AlbumCheck
def getContentType(pageUrl):
    """Return the Content-Type header reported for ``pageUrl``.

    Opens the URL with ``urllib2`` and reads the ``content-type`` response
    header. The response body is never read.

    Args:
        pageUrl: URL to probe.

    Returns:
        The header value as a string, an empty string when the response has
        no ``content-type`` header, or the sentinel string ``'bool'`` when
        the URL could not be opened (a message is printed in that case).
    """
    try:
        page = urllib2.urlopen(pageUrl)
    except Exception:
        # Best-effort probe: every failure to open the URL is reported through
        # the sentinel. Unlike a bare ``except``, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        print("CANNOT OPEN URL, IMAGE NOT SAVED")
        return 'bool'
    try:
        contentType = page.headers.getheader('content-type')
    finally:
        # Always release the connection, even if reading the header fails.
        page.close()
    # Callers run substring tests ("jpeg" in ...) on the result, so never
    # hand back None.
    return contentType or ''
# Archive of every URL that has already been downloaded, one URL per row.
_ARCHIVE_FILE = 'archive.csv'


def _load_archived_urls(path):
    """Return the set of URLs already recorded in the archive at ``path``."""
    archived = set()
    try:
        with open(path, 'rb') as archive:
            # Use the same delimiter the archive is written with so each row
            # parses back to a single [url] field.
            for row in csv.reader(archive, delimiter=' '):
                if row:
                    archived.add(row[0])
    except IOError:
        # No archive yet (e.g. first run): nothing has been downloaded.
        pass
    return archived


def _save_image(imageUrl, fileName):
    """Download ``imageUrl`` to ``fileName``; return True on success."""
    try:
        urllib.urlretrieve(imageUrl, fileName)
    except Exception:
        # Best-effort: one failed download must not abort the whole run.
        print("Web Error")
        return False
    print("Image Saved")
    return True


# Fetch today's top posts; fail early with a clear HTTP error instead of a
# KeyError on an unexpected (e.g. rate-limited) response body.
r = requests.get('http://www.reddit.com/r/mapporn/top/.json?sort=top&t=day&limit=30',
                 timeout=30)
r.raise_for_status()
data = r.json()

imgsave = 0
nosave = 0

# Read the archive once up front. The previous code iterated a csv reader
# inside the post loop, which was exhausted after the first post, so later
# posts were never checked for duplicates.
archivedUrls = _load_archived_urls(_ARCHIVE_FILE)

# The Python 2 csv module expects files opened in binary mode.
with open(_ARCHIVE_FILE, 'ab') as arc:
    arcwriter = csv.writer(arc, delimiter=' ')
    for post in data['data']['children']:
        info = post['data']
        url = info['url']
        name = info['id']
        domain = info['domain']
        print("Url: " + "\n" + "-----" + url)

        # JSON strings are unicode; encode once so the archive key is a byte
        # string that csv can write and that matches what is read back.
        archiveKey = url.encode('utf-8')
        if archiveKey in archivedUrls:
            print("Duplicate!")
            print("This is a duplicate image, not saving")
            nosave += 1
            continue

        # Guard against a None result so the substring tests below are safe.
        ext = getContentType(url) or ''

        # Only jpeg/png responses are saved directly; anything else is handed
        # to ImgrCheck, and saved as url + ".jpg" when it returns True.
        if "jpeg" in ext:
            saved = _save_image(url, name + ".jpg")
        elif "png" in ext:
            saved = _save_image(url, name + ".png")
        elif ImgrCheck(domain, ext, url) == True:
            saved = _save_image(url + ".jpg", name + ".jpg")
        else:
            print("THIS IS NOT A SUPPORTED TYPE, NOT SAVED")
            print(AlbumCheck(url))
            print("EXT " + ext)
            saved = False

        if not saved:
            nosave += 1
            continue

        imgsave += 1
        # Record the URL so this run and later runs skip it.
        arcwriter.writerow([archiveKey])
        archivedUrls.add(archiveKey)
# Hand the saved / not-saved counters to DrawResults.
results = DrawResults(imgsave, nosave)

# Block until the user explicitly confirms quitting. Local names avoid
# rebinding the builtins ``quit`` and ``input``.
while True:
    try:
        answer = raw_input("Do you want to quit? Y or N: ")
    except EOFError:
        # stdin is closed (non-interactive run): nobody can answer, so stop
        # asking instead of crashing with a traceback.
        break
    if answer.strip().lower() == 'y':
        break