Skip to content

Commit

Permalink
replace the bioguide scraper with one that can do a deep parse of bioguide entries using a context free grammar
Browse files Browse the repository at this point in the history
  • Loading branch information
JoshData committed Aug 2, 2015
1 parent cdca839 commit 5cf842f
Show file tree
Hide file tree
Showing 4 changed files with 661 additions and 190 deletions.
283 changes: 94 additions & 189 deletions scripts/bioguide.py
Original file line number Diff line number Diff line change
@@ -1,216 +1,95 @@
#!/usr/bin/env python

# gets fundamental information for every member with a bioguide ID:
# first name, nickname, middle name, last name, name suffix
# birthday
# Updates our database using a deep parse of the bioguide.

# options:
# --cache: load from cache if present on disk (default: true)
# --current: do *only* current legislators (default: true)
# --historical: do *only* historical legislators (default: false)
# --bioguide: do *only* a single legislator
# --relationships: Get familial relationships to other members of congress past and present, when applicable
# --cache: load bioguide from cache if present on disk (default: true)
# --bioguide X000000: do *only* a single legislator

import lxml.html, io
import datetime
import re
import utils
from utils import download, load_data, save_data

def run():

def update_birthday(bioguide, person, main):
    """Extract a birthday from a bioguide biography paragraph and store it
    as "YYYY-MM-DD" under person["bio"]["birthday"].

    bioguide -- the member's bioguide ID (used only in log messages)
    person   -- the legislator record (dict), updated in place
    main     -- the biography paragraph text

    Appends the bioguide ID to the enclosing scope's `warnings` list when
    no parseable birthday is found.
    """

    birthday = birthday_for(main)
    if not birthday:
        # Fix: print the string itself; main.encode("utf8") under Python 3
        # yields bytes, so "%s" would print a b'...' repr.
        print("[%s] NO BIRTHDAY :(\n\n%s" % (bioguide, main))
        warnings.append(bioguide)
        return
    if birthday == "UNKNOWN":
        # birthday_for determined that the bio explicitly lacks a usable
        # full date; not worth a warning.
        return

    try:
        # e.g. "January 26, 1852" -> strip commas -> parse as "%B %d %Y"
        birthday = datetime.datetime.strptime(birthday.replace(",", ""), "%B %d %Y")
    except ValueError:
        print("[%s] BAD BIRTHDAY :(\n\n%s" % (bioguide, main))
        warnings.append(bioguide)
        return

    birthday = "%04d-%02d-%02d" % (birthday.year, birthday.month, birthday.day)
    person.setdefault("bio", {})["birthday"] = birthday


def birthday_for(string):
    """Pull a birth date ("Month D, YYYY") out of a bioguide biography.

    Returns the matched date text, "UNKNOWN" when the bio demonstrably
    lacks a full birth date, or None when nothing could be recognized.
    """
    # A handful of entries have semicolons in awkward places; rewrite
    # them so the "born <date>" pattern below can find the date.
    fixups = (
        ("born in Cresskill, Bergen County, N. J.; April", "born April"),
        ("FOSTER, A. Lawrence, a Representative from New York; September 17, 1802;", "born September 17, 1802"),
        ("CAO, Anh (Joseph), a Representative from Louisiana; born in Ho Chi Minh City, Vietnam; March 13, 1967", "born March 13, 1967"),
        ("CRITZ, Mark S., a Representative from Pennsylvania; born in Irwin, Westmoreland County, Pa.; January 5, 1962;", "born January 5, 1962"),
        ("SCHIFF, Steven Harvey, a Representative from New Mexico; born in Chicago, Ill.; March 18, 1947", "born March 18, 1947"),
        ('KRATOVIL, Frank, M. Jr., a Representative from Maryland; born in Lanham, Prince George\u2019s County, Md.; May 29, 1968', "born May 29, 1968"),
    )
    for awkward, normalized in fixups:
        string = string.replace(awkward, normalized)

    months = "January|February|March|April|May|June|July|August|September|October|November|December"

    # Look for "born ... Month D, YYYY" within one semicolon-delimited clause.
    full_date = re.search(r"born [^;]*?((?:%s),? \d{1,2},? \d{4})" % months, string, re.I)
    if full_date and full_date.group(1):
        return full_date.group(1).strip()

    # Specifically detect cases we can't handle, to avoid noisy warnings.
    if re.search("birth dates? unknown|date of birth is unknown", string, re.I):
        return "UNKNOWN"
    # A year (possibly with a month but no day) is not enough to record.
    if re.search(r"born [^;]*?(?:in|about|before )?(?:(?:%s) )?\d{4}" % months, string, re.I):
        return "UNKNOWN"
    return None

def relationships_of(string):
    """Extract familial relationships ("son of X", "great-nephew of Y and Z")
    from the parenthetical that follows a member's name in the bioguide,
    returning a list of {"relation": ..., "name": ...} dicts (possibly empty).
    """
    # relationship data is stored in a parenthetical immediately after the end of the </font> tag in the bio
    # e.g. "(son of Joseph Patrick Kennedy, II, and great-nephew of Edward Moore Kennedy and John Fitzgerald Kennedy)"
    pattern = "^\((.*?)\)"
    match = re.search(pattern, string, re.I)

    relationships = []

    if match and len(match.groups()) > 0:
        # NOTE(review): under Python 3, .encode(...) yields bytes, and the
        # str-pattern re.split below would raise TypeError on bytes — confirm
        # whether this path still assumes Python 2 string semantics.
        relationship_text = match.group(1).encode("ascii", "replace")

        # since some relationships refer to multiple people--great-nephew of Edward Moore Kennedy AND John Fitzgerald Kennedy--we need a special grammar
        from nltk import tree, pos_tag, RegexpParser
        # Split on spaces/commas/semicolons and on hyphens not followed by a
        # digit (so hyphenated terms split but numeric ranges survive).
        tokens = re.split("[ ,;]+|-(?![0-9])", relationship_text)
        pos = pos_tag(tokens)

        # Chunk grammar: a MATCH is a RELATIONSHIP phrase ("son of", "married
        # to") followed by NAMES, one or more proper-noun names joined by
        # a preposition and optional conjunctions.
        grammar = r"""
            NAME: {<NNP>+}
            NAMES: { <IN><NAME>(?:<CC><NAME>)* }
            RELATIONSHIP: { <JJ|NN|RB|VB|VBD|VBN|IN|PRP\$>+ }
            MATCH: { <RELATIONSHIP><NAMES> }
            """
        cp = RegexpParser(grammar)
        chunks = cp.parse(pos)

        # iterate through the Relationship/Names pairs
        # NOTE(review): Tree.node was renamed to Tree.label() in newer nltk
        # releases — verify against the pinned nltk version.
        for n in chunks:
            if isinstance(n, tree.Tree) and n.node == "MATCH":
                people = []
                relationship = None
                for piece in n:
                    if piece.node == "RELATIONSHIP":
                        # Rejoin the relationship tokens, e.g. "great-nephew of".
                        relationship = " ".join([x[0] for x in piece])
                    elif piece.node == "NAMES":
                        # Each NAME subtree is one person's full name.
                        for name in [x for x in piece if isinstance(x, tree.Tree)]:
                            people.append(" ".join([x[0] for x in name]))
                # One output record per person; the relationship is shared.
                for person in people:
                    relationships.append({ "relation": relationship, "name": person})
    return relationships

# default to caching
cache = utils.flags().get('cache', True)
force = not cache

# pick either current or historical
# order is important here, since current defaults to true
if utils.flags().get('historical', False):
filename = "legislators-historical.yaml"
elif utils.flags().get('current', True):
filename = "legislators-current.yaml"
else:
print("No legislators selected.")
exit(0)

print("Loading %s..." % filename)
legislators = load_data(filename)


# reoriented cache to access by bioguide ID
by_bioguide = { }
for m in legislators:
if "bioguide" in m["id"]:
by_bioguide[m["id"]["bioguide"]] = m
from bioguide2 import parse_bioguide_entry

def run():
# Fetch the bioguide. Hits the network if the cache of the bioguide
# isn't present yet, or if --cache=False is set.
one_bioguide, bioguide_entries = download_the_bioguide()

# optionally focus on one legislator
# Do a deep parse on the bioguide.
parse_the_bioguide(bioguide_entries)

bioguide = utils.flags().get('bioguide', None)
if bioguide:
bioguides = [bioguide]
# Save result.
if not one_bioguide:
# Save a cached file if we aren't just parsing one record.
save_data(bioguide_entries, "bioguide-parsed.yaml")
else:
bioguides = list(by_bioguide.keys())

warnings = []
missing = []
count = 0
families = 0

for bioguide in bioguides:
# Download & parse the HTML of the bioguide page.
try:
dom = fetch_bioguide_page(bioguide, force)
except Exception as e:
print(e)
missing.append(bioguide)
continue

# Extract the member's name and the biography paragraph (main).

try:
name = dom.cssselect("p font")[0]
main = dom.cssselect("p")[0]
except IndexError:
print("[%s] Missing name or content!" % bioguide)
exit(0)

name = name.text_content().strip()
main = main.text_content().strip().replace("\n", " ").replace("\r", " ")
main = re.sub("\s+", " ", main)
import rtyaml
print(one_bioguide)
print(rtyaml.dump(bioguide_entries[one_bioguide]))

# Extract the member's birthday.

update_birthday(bioguide, by_bioguide[bioguide], main)

# Extract relationships with other Members of Congress.

if utils.flags().get("relationships", False):
#relationship information, if present, is in a parenthetical immediately after the name.
#should always be present if we passed the IndexError catch above
after_name = dom.cssselect("p font")[0].tail.strip()
relationships = relationships_of(after_name)
if len(relationships):
families = families + 1
by_bioguide[bioguide]["family"] = relationships

count = count + 1


print()
if warnings:
print("Missed %d birthdays: %s" % (len(warnings), str.join(", ", warnings)))

if missing:
print("Missing a page for %d bioguides: %s" % (len(missing), str.join(", ", missing)))

print("Saving data to %s..." % filename)
save_data(legislators, filename)

print("Saved %d legislators to %s" % (count, filename))

if utils.flags().get("relationships", False):
print("Found family members for %d of those legislators" % families)

# Some testing code to help isolate and fix issues:
# f
# none = "PEARSON, Joseph, a Representative from North Carolina; born in Rowan County, N.C., in 1776; completed preparatory studies; studied law; was admitted to the bar and commenced practice in Salisbury, N.C.; member of the State house of commons; elected as a Federalist to the Eleventh, Twelfth, and Thirteenth Congresses (March 4, 1809-March 3, 1815); while in Congress fought a duel with John George Jackson, of Virginia, and on the second fire wounded his opponent in the hip; died in Salisbury, N.C., October 27, 1834."
# print "Pearson (none): %s" % birthday_for(none)

# owens = "OWENS, William, a Representative from New York; born in Brooklyn, Kings County, N.Y., January, 20, 1949; B.S., Manhattan College, Riverdale, N.Y., 1971; J.D., Fordham University, New York, N.Y., 1974; United States Air Force; lawyer, private practice; faculty, State University of New York, Plattsburgh, N.Y., 1978-1986; elected as a Democrat to the One Hundred Eleventh Congress, by special election to fill the vacancy caused by the resignation of United States Representative John McHugh, and reelected to the two succeeding Congresses (November 3, 2009-present)."
# print "Owens (January, 20, 1949): %s" % birthday_for(owens)

# shea = "SHEA-PORTER, Carol, a Representative from New Hampshire; born in New York City, New York County, N.Y., December, 1952; graduated from Oyster River High School, Durham, N.H., 1971; B.A., University of New Hampshire, Durham, N.H., 1975; M.P.A., University of New Hampshire, Durham, N.H., 1979; social worker; professor; elected as a Democrat to the One Hundred Tenth Congress and to the succeeding Congress (January 3, 2007-January 3, 2011); unsuccessful candidate for reelection to the One Hundred Twelfth Congress in 2010; elected as a Democrat to the One Hundred Thirteenth Congress (January 3, 2013-present)."
# print "Shea (none): %s" % birthday_for(shea)
def download_the_bioguide():
    """Fetch and lightly normalize the bioguide pages for every legislator
    in the historical and current YAML files.

    Returns (one_bioguide, bioguide_entries) where one_bioguide is the
    --bioguide flag value (or None) and bioguide_entries maps bioguide ID
    to {"name": ..., "text": ...}.
    """
    # default to caching; --cache=False forces re-download
    cache = utils.flags().get('cache', True)
    force = not cache

    # Hoisted out of the loop: the flag value is loop-invariant.
    # optionally focus on one legislator
    one_bioguide = utils.flags().get('bioguide', None)

    bioguide_entries = { }
    for filename in ("legislators-historical.yaml", "legislators-current.yaml"):
        print("Fetching bioguide entries for legislators in %s..." % filename)
        legislators = load_data(filename)

        # reorient to access by bioguide ID
        by_bioguide = { }
        for m in legislators:
            if "bioguide" in m["id"]:
                by_bioguide[m["id"]["bioguide"]] = m

        if one_bioguide:
            if one_bioguide not in by_bioguide:
                continue
            bioguides = [one_bioguide]
        else:
            bioguides = sorted(by_bioguide.keys())

        # Download & parse the HTML of the bioguide pages.
        for bioguide in bioguides:
            try:
                dom = fetch_bioguide_page(bioguide, force)
            except Exception as e:
                # Best-effort: log and keep going on fetch failures.
                print(e)
                continue

            # Extract the member's name and the biography paragraph.
            try:
                name = dom.cssselect("p font")[0]
                biography = dom.cssselect("p")[0]
            except IndexError:
                print("[%s] Missing name or content!" % bioguide)
                continue

            name = name.text_content().strip().rstrip(',')
            # Collapse all runs of whitespace (incl. newlines) to one space.
            # Fix: use a raw string — "\s" is an invalid escape sequence in a
            # plain string literal (SyntaxWarning in modern Python).
            biography = biography.text_content().strip().replace("\n", " ").replace("\r", " ")
            biography = re.sub(r"\s+", " ", biography)

            bioguide_entries[bioguide] = {
                "name": name,
                "text": biography,
            }

    return one_bioguide, bioguide_entries

def fetch_bioguide_page(bioguide, force):
url = "http://bioguide.congress.gov/scripts/biodisplay.pl?index=%s" % bioguide
cache = "legislators/bioguide/%s.html" % bioguide
try:
body = download(url, cache, force)
body = download(url, cache, force, options={ "log_downloads": True })

# Fix a problem?
body = body.replace("&Aacute;\xc2\x81", "&Aacute;")
Expand All @@ -232,5 +111,31 @@ def fetch_bioguide_page(bioguide, force):

return dom

def parse_the_bioguide(bioguide_entries):
    """Run the modgrammar deep parse over every fetched bioguide entry,
    merging each parse result into its entry dict in place.

    Parsing is CPU-bound and slow, so the work is fanned out to a pool
    of worker processes.
    """
    from multiprocessing import Pool

    with Pool() as workers:
        # Submit one parse job per entry, in sorted bioguide-ID order.
        # apply_async returns an AsyncResult we can collect later.
        pending = {
            bid: workers.apply_async(
                parse_bioguide_entry,
                [bioguide_entries[bid]['name'], bioguide_entries[bid]['text']])
            for bid in sorted(bioguide_entries)
        }

        # Collect every result (blocking as needed) and fold it into the
        # corresponding entry in the main dict.
        for bid in sorted(pending):
            print(bid, bioguide_entries[bid]['name'], '...')
            bioguide_entries[bid].update(pending[bid].get())



# Script entry point: run the fetch/parse/save pipeline when executed directly.
if __name__ == '__main__':
    run()
Loading

0 comments on commit 5cf842f

Please sign in to comment.