-
Notifications
You must be signed in to change notification settings - Fork 0
/
PA-Beaver.py
248 lines (203 loc) · 10.6 KB
/
PA-Beaver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
# from pyquery import PyQuery as pq
import requests
import csv
import re
from collections import OrderedDict
import datetime
from decimal import *
import os
import configuration # Local file
# Run settings. (For offline testing, a previously saved copy of the report
# can be read into `html` instead of downloading it.)

# County-hosted "Results by Precinct" report that this script scrapes.
sourceurl = "http://www.beavercountypa.gov/Depts/Elections/Documents/ElectionNightResults/2019_Results_by_Precinct_EL30.htm"

# Each run stores its raw download in its own timestamped snapshot folder.
rawtime = datetime.datetime.now()
timestamp = rawtime.strftime("%Y%m%d-%H%M%S")
snapshotsdir = configuration.snapshotsdir
filepath = f"{snapshotsdir}/{timestamp}/"
filename = "PA-Beaver.html"

# The finished CSV is also written to the shared target directory.
targetdir = configuration.targetdir
targetfilename = "70-PA-Beaver.csv"

os.makedirs(filepath, exist_ok=True)
# Manual override table used while building result rows.
# Each key is a substring to look for in a row's office name; each value is a
# list of substrings to look for in the candidate's name. When both match, the
# parser appends " (two years)" to that row's office name.
# NOTE(review): presumably this separates short-term seats from the regular
# contest of the same name -- confirm against the county's ballot.
kludge = {
"Aliquippa School": ["Repic", "Gilbert"],
"Council Aliquippa": ["Milliner", "West"],
"Blackhawk School": ["Heckathorn"],
"Freedom Area": ["Geibel", "Sherman"],
"Harmony Twp": ["Mosura"],
"Southside School": ["Stewart", "Allison"],
"Council Midland": ["Noto", "Drozdjibob"],
"Raccoon Twp": ["Marshall"]
}
# Download and save the raw file.
# Fetch first so that a failed request never leaves an empty snapshot behind,
# and fail loudly on an HTTP error instead of parsing an error page as results.
response = requests.get(sourceurl, timeout=60)
response.raise_for_status()
with open(filepath + filename, "wb") as f:
    f.write(response.content)

# Re-read the snapshot as text, one list entry per line of the report.
with open(filepath + filename, "r") as f:
    html = f.readlines()
# Column names for the output CSV, in output order. Every result row is
# initialised with exactly these keys (all blank) before being filled in, and
# this list is written as the CSV header row.
lineheaders = ["id", "raceid", "racetype", "racetypeid", "ballotorder", "candidateid", "description",
"delegatecount", "electiondate", "electtotal", "electwon", "fipscode", "first", "incumbent",
"initialization_data", "is_ballot_measure", "last", "lastupdated", "level", "national",
"officeid", "officename", "party", "polid", "polnum", "precinctsreporting", "precinctsreportingpct",
"precinctstotal", "reportingunitid", "reportingunitname", "runoff", "seatname",
"seatnum", "statename", "statepostal", "test", "uncontested", "votecount", "votepct", "winner"
]
# One timestamp shared by every row produced in this run.
lastupdated = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

# Split the report into one list of lines per precinct. A line containing
# "PRECINCT REPORT" starts a new precinct; that line is kept as the first
# entry of the precinct it opens.
precincts = []
precinctrows = None
for row in html[3:-2]:  # Strip off pre and HTML tags
    if "PRECINCT REPORT" in row:
        if precinctrows:  # Nothing to flush on the very first pass
            precincts.append(precinctrows)
        precinctrows = []
    precinctrows.append(row.rstrip())
precincts.append(precinctrows)  # The final precinct has no marker after it
def numberatend(row):
    """Return the integer that ends *row*.

    Thousands separators are accepted, so a line ending in "1,000" yields
    1000 rather than the trailing "000".

    Args:
        row: A report line with trailing whitespace already removed.

    Returns:
        The trailing number as an int.

    Raises:
        ValueError: If the row does not end with a number.
    """
    match = re.search(r"(\d[\d,]*)$", row)
    if match is None:
        raise ValueError(f"No trailing number found in row: {row!r}")
    return int(match.group(1).replace(",", ""))
def fetchvotetally(row):
    """Return the vote count from a candidate result row.

    The count is the first run of digits (commas allowed) that follows a
    " ." leader dot and whitespace, e.g. the 103 in
    "BOB CASEY JR (DEM) . . . .   103   60.95".

    Args:
        row: A candidate line from the report.

    Returns:
        The vote count as an int, with any thousands separators removed.

    Raises:
        ValueError: If no tally can be found in the row.
    """
    match = re.search(r" \.\s+([0-9,]+)", row)
    if match is None:
        raise ValueError(f"No vote tally found in row: {row!r}")
    return int(match.group(1).replace(",", ""))
def fetchname(row):
    """Return the candidate (or choice) name from a result row, title-cased.

    The name is everything before the first " . " leader in the row. One
    trailing period left over from the leader dots is dropped.

    Args:
        row: A candidate line from the report.

    Returns:
        The title-cased name, or *row* unchanged when no name precedes the
        leader dots (fail over by including more rather than less).
    """
    basename = row.split(" . ")[0].strip()
    if not basename:
        return row
    if basename.endswith("."):  # If there's still a period at the end
        basename = basename[:-1]
    return basename.title()
"""
Basic approach:
First step is to pull each precinct's results into a separate data structure.
Inside each precinct's results is a semi-predictable data structure. Ish.
There's a header block, then an empty line. Immediately after that is the precinct name.
Solution: Grab the first line after the first blank line.
Some time after that is a summary thing called "BALLOTS CAST - TOTAL". If this total is zero,
we can sort of kind of assume that no votes are in. We'll need to mark this precinct as in or out later.
The whole header section ends with a blank line after "TURNOUT" or "BALLOTS CAST" in the previous line.
In the actual result section, things get weirder.
Sometimes there's a declaration that something is partisan, like:
************* (DEMOCRATIC) ************ or some such. So if we find such a tag, we can assume there's a partisan
label that we can apply to the race name, like "Supreme Court -- Dem."
There will be more than one party.
Each race has at least one little header row. How do we identify it? The race name is the first line after a blank
line that's not indented as much.
UNITED STATES SENATOR
Vote for NOT MORE THAN 1
    BOB CASEY JR (DEM) . . . . . . . . 103 60.95
    LOU BARLETTA (REP) . . . . . . . . 64 37.87
How do we know how indented that is? Well, strip off the white space on the left, and compare the length of that
to the length of the full row. If it's the smallest indent, it's one of the header rows.
So then the second row here "Vote for NOT MORE THAN 1" -- we do not care.
But the actual vote counts, we do care. We do want the name and all, we do want the number of votes.
So first we scrape all that stuff.
OOOOK, so what?
Well, we want to export out the scraped data in a standardized format -- specifically, the CSV output from
the Elex package. You can see documentation here:
https://github.com/PalmBeachPost/election-results-parser
If we can dump the data in the right format, we can use election-results-parser to publish it.
"""
# In[ ]:
# Parse every precinct into candidate-level rows.
#   masterlist: one OrderedDict per candidate per precinct, keyed by lineheaders.
#   racevotes:  running total of votes per raceid, summed over all precincts.
masterlist = []
racevotes = {}
for precinct in precincts[0:]:
    raceparty = ""
    racepartytag = ""
    lastline = "beer"  # Any non-empty value, so the first row doesn't look like it follows a blank line
    precinctid = None
    endofheader = None
    headerindent = 1000  # Larger than any real indent; narrowed when the first contest title is seen
    # Reset for each precinct so a report without a "BALLOTS CAST - TOTAL"
    # line cannot inherit the previous precinct's status.
    precinctsreporting = 0

    # Pass 1: the precinct's header block.
    for i, row in enumerate(precinct):
        if not precinctid:  # Set precinct ID to be the first thing after the first whitespace
            if len(row) > 0 and len(lastline) == 0:
                precinctid = row.strip().title()
        if "BALLOTS CAST - TOTAL" in row:
            ballotscast = numberatend(row)
            if ballotscast == 0:
                precinctsreporting = 0
            else:
                precinctsreporting = 1
        if len(row) == 0 and ("TURNOUT" in lastline or "BALLOTS CAST" in lastline):  # First blank space after header stuff
            endofheader = i
            break  # We're done processing all the header info for this precinct
        lastline = row

    # Pass 2: the actual race results. If no end of header was found,
    # endofheader is still None and the whole precinct is scanned.
    for row in precinct[endofheader:]:
        if len(row) > 0:  # Skip blank lines
            partymatch = re.search(r"\*+ \((.*)\) \*+", row)  # Find what's between ***s, spaces, parentheses
            if partymatch:  # If we have something that looks like a party ...
                raceparty = partymatch.group(1)
                if raceparty == "DEMOCRATIC":
                    racepartytag = " -- Dem."
                elif raceparty == "REPUBLICAN":
                    racepartytag = " -- Rep."
                else:
                    # NOTE(review): unlike the two tags above, this one has no
                    # " -- " separator -- confirm whether that is intended.
                    racepartytag = raceparty.title()
                raceparty = raceparty.title()  # Title casing
            rowindent = len(row) - len(row.lstrip())

            # The first line after a blank, without a deeper indent, that's not a party declaration,
            # should be the race name. There will be similar indents, like "Vote for three" or whatever.
            # But those don't get the race name.
            if len(lastline) == 0 and rowindent <= headerindent and "********" not in row:
                headerindent = rowindent
                contestname = row.strip().title()

            # The only other data type we're looking for ... is the actual candidate-level results.
            if rowindent > headerindent and "******" not in row:  # If we have data
                line = OrderedDict()
                peep = fetchname(row)
                candidatevotes = fetchvotetally(row)
                for item in lineheaders:
                    line[item] = ""  # Generate the right data structure in the right order, then fill it in
                line['precinctstotal'] = 1
                line['precinctsreporting'] = precinctsreporting
                line['precinctsreportingpct'] = precinctsreporting
                line['reportingunitid'] = "PA-Beaver County-" + precinctid
                line['reportingunitname'] = precinctid
                # Office is the part of the contest title before the first comma.
                line['officename'] = contestname.split(",")[0].strip()
                for kludgeoffice in kludge:
                    if kludgeoffice in line['officename']:
                        for kludgepeep in kludge[kludgeoffice]:
                            if kludgepeep in peep:
                                line['officename'] = line['officename'] + " (two years)"
                if " " not in peep:  # Handle single-word candidates like "YES"
                    line['first'] = peep
                    line['last'] = ""
                else:
                    line['first'] = peep[:peep.rfind(" ")].strip()  # First name is everything until the last space
                    line['last'] = peep[peep.rfind(" "):].strip()  # Last name is everything after the last space
                # Seat is whatever follows the first comma of the contest title.
                # NOTE(review): as it appears here, replace() swaps a single space
                # for a single space, which is a no-op; presumably it was meant to
                # collapse doubled spaces -- confirm against the raw report.
                line['seatname'] = ", ".join(contestname.split(",")[1:]).strip().replace(" ", " ")
                if len(raceparty) > 0:
                    line['seatname'] = line['seatname'] + racepartytag
                    line['party'] = raceparty
                line['raceid'] = f"{line['officename']}_{line['seatname']}_{line['seatnum']}"
                line['id'] = f"PA-Beaver_{line['raceid']}_{line['reportingunitid']}"
                line['candidateid'] = f"PA-Beaver_{line['raceid']}_{peep}"
                line['lastupdated'] = lastupdated
                line['level'] = "subunit"
                line['votecount'] = candidatevotes
                if line["raceid"] not in racevotes:
                    racevotes[line["raceid"]] = 0
                racevotes[line["raceid"]] += int(line["votecount"])
                masterlist.append(line)
        lastline = row  # Tracked for blank rows too, so titles after a blank line are detected
# Second pass: now that every precinct has been read, stamp each row with its
# race's total votes and the row's share of that total.
# For Elex-CSV, the "pct" is kept as a decimal, not a percentage. That is, the number ranges from 0 to 1.
for line in masterlist:
    electtotal = racevotes[line["raceid"]]
    line["electtotal"] = electtotal
    if electtotal == 0 or line["votecount"] == 0:
        # Avoids dividing by zero when a race has no votes yet.
        line["votepct"] = 0
    else:
        # Use this row's own vote count as the numerator.
        line["votepct"] = Decimal(line["votecount"]) / Decimal(electtotal)
# Write the same CSV twice: once beside the raw snapshot for this run, and
# once into the directory the results are published from.
tempfilenames = [filepath + targetfilename, targetdir + targetfilename]
for tempfilename in tempfilenames:
    with open(tempfilename, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(lineheaders)  # Header row first
        writer.writerows(list(entry.values()) for entry in masterlist)
print(f"Done parsing out PA-Beaver with {len(masterlist)} entries.")