-
Notifications
You must be signed in to change notification settings - Fork 0
/
tavenner.py
executable file
·95 lines (74 loc) · 2.55 KB
/
tavenner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import errno
import zipfile
import sys
import hashlib
import time

# Kill switch: this scraper is intentionally disabled (see the linked issue).
# The imports above must come first -- calling sys.exit() before `import sys`
# raises NameError and aborts with a traceback instead of exiting cleanly.
print("Do not run, per https://github.com/openva/tavenner/issues/11")
sys.exit()

# Directory (relative to the working directory) where fetched pages are saved.
output_dir = 'output'
def _highest_saved_id(directory):
    """Return the largest filing ID saved as ``<id>.html`` in *directory*.

    Returns None when the directory contains no such file. Entries that are
    not ``<digits>.html`` are ignored, and the maximum is taken numerically
    because os.listdir() returns names in arbitrary order.
    """
    saved_ids = []
    for name in os.listdir(directory):
        stem, extension = os.path.splitext(name)
        if extension == '.html' and stem.isdigit():
            saved_ids.append(int(stem))
    return max(saved_ids) if saved_ids else None


def _read_resume_file(path):
    """Return the integer stored in the file at *path*, or None.

    None is returned when the file cannot be opened or does not contain an
    integer, so that a missing resume file is not an error.
    """
    try:
        with open(path) as handle:
            return int(handle.read())
    except (IOError, ValueError):
        return None


def main():
    """Download filings one ID at a time and save them under ``output_dir``.

    The starting ID is the larger of (a) one past the highest ``<id>.html``
    already present in ``output_dir`` and (b) the number stored in a
    ``.resume`` file in the working directory (this function only reads that
    file). When neither is available, the scan starts at 2050.

    Each response whose content length exceeds 590 bytes is written to
    ``output_dir/<zero-padded 6-digit id>.html``; shorter responses and HTTP
    errors count as consecutive errors. After 50 consecutive errors the ID
    advances by 10 per request instead of 1, and after 200 the scan stops.
    Progress is shown on stdout: '.' saved, ' ' blank page, 'X' HTTP error.
    """
    # urllib2 was split into urllib.request / urllib.error in Python 3.
    try:
        from urllib2 import Request, urlopen, HTTPError
    except ImportError:
        from urllib.request import Request, urlopen
        from urllib.error import HTTPError

    url_base = 'http://ethicssearch.dls.virginia.gov/ViewFormBinary.aspx?filingid='

    # Make our output directory if it doesn't exist; otherwise look for the
    # last filing that was saved so we can continue after it.
    candidates = []
    if os.path.exists(output_dir):
        last_saved = _highest_saved_id(output_dir)
        if last_saved is not None:
            candidates.append(last_saved + 1)
    else:
        os.makedirs(output_dir)

    # If we have a resume point file, resume from that number when it is
    # further along than what is already on disk.
    stored = _read_resume_file('.resume')
    if stored is not None:
        candidates.append(stored)

    i = max(candidates) if candidates else 2050
    errors = 0
    print("Resuming at " + str(i))

    while True:

        # Note the time at which we started reading this record, to make sure
        # that we don't make more than one request every two seconds.
        # (Too-frequent requests result in being blocked.)
        start_time = time.time()

        try:
            response = urlopen(Request(url_base + str(i)))
        except HTTPError:
            # If there's an HTTP error, record that.
            sys.stdout.write('X')
            sys.stdout.flush()
            errors += 1
        else:
            try:
                body = response.read()
                declared = response.headers.get('content-length')
            finally:
                response.close()

            # Fall back to the actual body size if the server sent no
            # Content-Length header.
            length = len(body) if declared is None else int(declared)

            if length > 590:
                # The content is long enough to be legitimate.
                sys.stdout.write('.')
                sys.stdout.flush()
                errors = 0

                # Save the file, byte-for-byte as received.
                filename = os.path.join(output_dir, str(i).zfill(6) + '.html')
                with open(filename, 'wb') as local_file:
                    local_file.write(body)
            else:
                # The content is short, indicating a blank page.
                sys.stdout.write(' ')
                sys.stdout.flush()
                errors += 1

        # Increment our counter.
        i += 1

        # If we get 50 errors in a row, count by 10s.
        if errors >= 50:
            i += 9

        # If we get 200 errors in a row, quit.
        if errors >= 200:
            print("Too many consecutive errors encountered—stopping")
            break

        # Don't query more than once every two seconds. The elapsed time is
        # measured once so the sleep duration can never be negative.
        elapsed = time.time() - start_time
        if elapsed < 2:
            time.sleep(2 - elapsed)
# Start the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()