Merge pull request #5069 from braykuka/1207-IN-bills-scraper-rewrite-2025

IN: fix event exhibits and minutes url
jessemortenson authored Nov 5, 2024
2 parents 7d75b82 + 71b88ed commit f7ed0d3
Showing 2 changed files with 97 additions and 102 deletions.
143 changes: 74 additions & 69 deletions scrapers/in/bills.py
@@ -71,14 +71,12 @@ def _process_votes(self, rollcalls, bill_id, original_chamber, session):

        for r in rollcalls:
            proxy_link = PROXY_BASE_URL + r["link"]

            try:
-                (path, resp) = self.urlretrieve(proxy_link)
-            except scrapelib.HTTPError as e:
-                self.warning(e)
-                self.warning(
+                path, _ = self.urlretrieve(proxy_link)
+            except scrapelib.HTTPError:
+                self.logger.warning(
                    "Unable to contact openstates proxy, skipping vote {}".format(
-                        r["link"]
+                        proxy_link
                    )
                )
                continue
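For reference, the retry-and-skip pattern introduced here can be exercised on its own. A minimal sketch, assuming scrapelib is installed; the vote path is a placeholder, not a real roll-call link:

    import scrapelib

    PROXY_BASE_URL = "https://in-proxy.openstates.org/"

    scraper = scrapelib.Scraper(retry_attempts=3)
    proxy_link = PROXY_BASE_URL + "2024/votes/some-roll-call"  # hypothetical path
    try:
        # urlretrieve downloads to a temp file and returns (path, response)
        path, _ = scraper.urlretrieve(proxy_link)
        print("saved to", path)
    except scrapelib.HTTPError:
        # mirror the scraper: warn and skip this vote instead of aborting the run
        print("Unable to contact openstates proxy, skipping vote", proxy_link)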
@@ -275,23 +273,22 @@ def scrape(self, session=None):

            try:
                bill_json = client.get("bill", session=session, bill_link=bill_link)
+                # vehicle bill
+                if not bill_json:
+                    self.logger.warning("Vehicle Bill: {}".format(bill_id))
+                    continue
            except scrapelib.HTTPError:
                self.logger.warning("Bill could not be accessed. Skipping.")
                continue

-            # vehicle bill
-            if len(list(bill_json.keys())) == 0:
-                self.logger.warning("Vehicle Bill: {}".format(bill_id))
-                continue
-            # sometimes description is blank
-            # if that's the case, we can check to see if
-            # the latest version has a short description
            title = bill_json["description"]
+            # Check if the title is "NoneNone" (indicating a placeholder) and set it to None
            if "NoneNone" in title:
                title = None
+            # If the title is still empty or None, try to get the short description from the latest version
            if not title:
-                title = bill_json["latestVersion"]["shortDescription"]
-            # and if that doesn't work, use the bill_id but throw a warning
+                title = bill_json["latestVersion"].get("shortDescription")
+            # If the title is still not available, use the bill ID and log a warning
            if not title:
                title = bill_id
                self.logger.warning("Bill is missing a title, using bill id instead.")
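Taken together, the new title logic is a three-step fallback chain. A minimal sketch, with field names following the API payload used above:

    def resolve_title(bill_json, bill_id):
        title = bill_json["description"]
        if "NoneNone" in title:  # placeholder value seen in API responses
            title = None
        if not title:
            title = bill_json["latestVersion"].get("shortDescription")
        if not title:
            title = bill_id  # last resort; the scraper also logs a warning
        return title

For example, resolve_title({"description": "NoneNone", "latestVersion": {"shortDescription": "State budget"}}, "HB 1001") returns "State budget".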
@@ -314,19 +311,15 @@ def scrape(self, session=None):

            bill.add_source(api_source, note="API details")

            # sponsors
-            for s in bill_json["authors"]:
-                self._add_sponsor_if_not_blank(bill, s, classification="author")
-            for s in bill_json["coauthors"]:
-                self._add_sponsor_if_not_blank(bill, s, classification="coauthor")
-            for s in bill_json["sponsors"]:
-                self._add_sponsor_if_not_blank(bill, s, classification="sponsor")
-            for s in bill_json["cosponsors"]:
-                self._add_sponsor_if_not_blank(bill, s, classification="cosponsor")
+            for category in ["authors", "coauthors", "sponsors", "cosponsors"]:
+                for sponsor in bill_json.get(category, []):
+                    self._add_sponsor_if_not_blank(
+                        bill, sponsor, classification=category[:-1]
+                    )

            # actions
            action_link = bill_json["actions"]["link"]
            api_source = urljoin(api_base_url, action_link)
-
            try:
                actions = client.get(
                    "bill_actions", session=session, action_link=action_link
@@ -336,75 +329,84 @@ def scrape(self, session=None):
                self.logger.warning("Could not find bill actions page")
                actions = []

-            for a in actions:
-                action_desc = a["description"]
+            for action in actions:
+                action_desc = action["description"]

+                # Determine action chamber
                if "governor" in action_desc.lower():
                    action_chamber = "executive"
-                elif a["chamber"]["name"].lower() == "house":
+                elif action["chamber"]["name"].lower() == "house":
                    action_chamber = "lower"
                else:
                    action_chamber = "upper"
-                date = a["date"]

+                # Process action date
+                date = action.get("date")
+                if not date:
+                    self.logger.warning("Action has no date, skipping")
+                    continue

-                # convert time to pupa fuzzy time
-                date = date.replace("T", " ")
-                # TODO: if we update pupa to accept datetimes we can drop this line
-                date = date.split()[0]
+                # Convert date to pupa fuzzy time format
+                date = date.replace("T", " ").split()[0]  # Extract date part only

-                d = action_desc.lower()
+                action_desc_lower = action_desc.lower()
                committee = None

                reading = False
-                attrs = self.categorizer.categorize(action_desc)
-                action_type = attrs["classification"]

-                if "first reading" in d:
-                    reading = True

-                if "second reading" in d or "reread second time" in d:
-                    reading = True

-                if "third reading" in d or "reread third time" in d:
-                    action_type.append("reading-3")
+                action_type = self.categorizer.categorize(action_desc)["classification"]

+                # Identify reading actions
+                if any(
+                    phase in action_desc_lower
+                    for phase in [
+                        "first reading",
+                        "second reading",
+                        "third reading",
+                        "reread second time",
+                        "reread third time",
+                    ]
+                ):
+                    reading = True

-                if "adopted" in d and reading:
+                if (
+                    "third reading" in action_desc_lower
+                    or "reread third time" in action_desc_lower
+                ):
+                    action_type.append("reading-3")

+                # Mark passage if adopted during reading
+                if "adopted" in action_desc_lower and reading:
                    action_type.append("passage")

-                if (
-                    "referred" in d
-                    and "committee on" in d
-                    or "reassigned" in d
-                    and "committee on" in d
-                ):
-                    committee = d.split("committee on")[-1].strip()
+                # Identify related committee
+                if "committee on" in action_desc_lower:
+                    committee = action_desc_lower.split("committee on")[-1].strip()

-                a = bill.add_action(
+                # Add action to bill
+                action_instance = bill.add_action(
                    chamber=action_chamber,
                    description=action_desc,
                    date=date,
                    classification=action_type,
                )

+                # Add committee as related entity if present
                if committee:
-                    a.add_related_entity(committee, entity_type="organization")
-
-            # subjects
-            subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]]
-            for subject in subjects:
-                subject = (
-                    subject
-                    if not subject.startswith("PENSIONS AND RETIREMENT BENEFITS")
-                    else "PENSIONS AND RETIREMENT BENEFITS; Public Retirement System (INPRS)"
-                )
+                    action_instance.add_related_entity(
+                        committee, entity_type="organization"
+                    )

+            # Extract subjects from the latest version of the bill
+            latest_subjects = bill_json["latestVersion"]["subjects"]
+            for subject_entry in latest_subjects:
+                subject = subject_entry["entry"]
+                if subject.startswith("PENSIONS AND RETIREMENT BENEFITS"):
+                    subject = "PENSIONS AND RETIREMENT BENEFITS; Public Retirement System (INPRS)"
+                # Add the processed subject to the bill
                bill.add_subject(subject)

            # Abstract
-            if bill_json["latestVersion"]["digest"]:
-                bill.add_abstract(bill_json["latestVersion"]["digest"], note="Digest")
+            digest = bill_json["latestVersion"]["digest"]
+            if digest:
+                bill.add_abstract(digest, note="Digest")

            # votes
            yield from self._process_votes(
@@ -415,10 +417,13 @@ def scrape(self, session=None):
            )

            for v in bill_json["versions"]:
                # note there are a number of links in the API response that won't work with just a browser, they need an api key
                # https://iga.in.gov/pdf-documents/123/2024/house/resolutions/HC0001/HC0001.01.INTR.pdf
                category = "resolutions" if "resolution" in bill_type else "bills"
-                url = f"https://iga.in.gov/pdf-documents/{self.session_no}/{bill_json['year']}/{bill_json['originChamber']}/{category}/{v['billName']}/{v['printVersionName']}.pdf"
+                url = (
+                    f"https://iga.in.gov/pdf-documents/{self.session_no}/"
+                    f"{bill_json['year']}/{bill_json['originChamber']}/"
+                    f"{category}/{v['billName']}/{v['printVersionName']}.pdf"
+                )
                # PROXY URL
                # url = urljoin(PROXY_BASE_URL, v['link'])
                bill.add_version_link(
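Plugging in the values from the sample URL quoted in the comment above shows what the reformatted multi-line template produces; all values are illustrative:

    session_no = "123"
    year = "2024"
    origin_chamber = "house"
    category = "resolutions"  # "resolution" in bill_type
    bill_name = "HC0001"
    print_version = "HC0001.01.INTR"
    url = (
        f"https://iga.in.gov/pdf-documents/{session_no}/"
        f"{year}/{origin_chamber}/"
        f"{category}/{bill_name}/{print_version}.pdf"
    )
    assert url == (
        "https://iga.in.gov/pdf-documents/123/2024/house"
        "/resolutions/HC0001/HC0001.01.INTR.pdf"
    )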
56 changes: 23 additions & 33 deletions scrapers/in/events.py
@@ -1,5 +1,4 @@
import json
- import logging
import re
from datetime import date
from urllib.parse import urljoin
@@ -12,10 +11,6 @@
from openstates.exceptions import EmptyScrape


- log = logging.getLogger(__name__)
- PROXY_BASE_URL = "https://in-proxy.openstates.org/"
-
-
class INEventScraper(Scraper):
    _tz = pytz.timezone("America/Indianapolis")
    base_url = "https://beta-api.iga.in.gov"
@@ -28,9 +23,10 @@ def __init__(self, *args, **kwargs):
    def scrape(self):
        session_no = self.apiclient.get_session_no(self.session)
        response = self.apiclient.get("meetings", session=self.session)
+
        meetings = response["meetings"]
-        if len(meetings["items"]) == 0:
-            raise EmptyScrape
+        if not meetings["items"]:
+            raise EmptyScrape("No meetings found in the response.")

        for item in meetings["items"]:
            meeting = self.apiclient.get(
@@ -41,9 +37,6 @@ def scrape(self):
                continue

            committee = meeting["committee"]
-
-            link = urljoin(self.base_url, meeting["link"])
-            _id = link.split("/")[-1]
            committee_name = (
                committee["name"]
                .replace(",", "")
@@ -58,19 +51,25 @@ def scrape(self):
            committee_chamber = (
                committee["chamber"].lower() if committee["chamber"] else "universal"
            )
-            date = meeting["meetingdate"].replace(" ", "")
-            time = meeting["starttime"]
-            if time:
-                time = time.replace(" ", "")
-                when = dateutil.parser.parse(f"{date} {time}")
+
+            link = urljoin(self.base_url, meeting["link"])
+            _id = link.split("/")[-1]
+
+            date_str = meeting["meetingdate"].replace(" ", "")
+            time_str = meeting["starttime"]
+            # Determine the 'when' variable based on the presence of time
+            if time_str:
+                time_str = time_str.replace(
+                    " ", ""
+                )  # Clean up any spaces in the time string
+                when = dateutil.parser.parse(f"{date_str} {time_str}")
                when = self._tz.localize(when)
                all_day = False
            else:
-                when = dateutil.parser.parse(date).date()
+                when = dateutil.parser.parse(date_str).date()
                all_day = True

            location = meeting["location"] or "See Agenda"

            video_url = (
                f"https://iga.in.gov/legislative/{self.session}/meeting/watchlive/{_id}"
            )
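The reworked date handling distinguishes timed meetings from all-day ones. In isolation it reduces to this sketch, using dateutil and pytz as the scraper does; the sample inputs are illustrative guesses at the API's string formats:

    import dateutil.parser
    import pytz

    tz = pytz.timezone("America/Indianapolis")

    def parse_when(meetingdate, starttime):
        date_str = meetingdate.replace(" ", "")
        if starttime:
            time_str = starttime.replace(" ", "")
            when = tz.localize(dateutil.parser.parse(f"{date_str} {time_str}"))
            return when, False  # timezone-aware datetime, not all-day
        return dateutil.parser.parse(date_str).date(), True  # all-day event

    print(parse_when("2024-11-05", "10:30 AM"))  # (2024-11-05 10:30:00-05:00, False)
    print(parse_when("2024-11-05", None))        # (2024-11-05, True)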
@@ -100,11 +99,9 @@ def scrape(self):
            event.add_media_link("Video of Hearing", video_url, media_type="text/html")

            agendas = meeting["agenda"]
-            if type(agendas) is str:
-                agendas = json.loads(meeting["agenda"])
-            if agendas:
-                agenda = event.add_agenda_item("Bills under consideration")
-
+            if isinstance(agendas, str):
+                agendas = json.loads(agendas)
+            agenda = event.add_agenda_item("Bills under consideration")
            for agenda_item in agendas:
                if agenda_item.get("bill", None):
                    bill_id = agenda_item["bill"].get("billName")
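The isinstance change handles the API returning the agenda either as a JSON-encoded string or as an already-parsed list; both shapes normalize to the same thing. A small sketch with illustrative payloads:

    import json

    def normalize_agenda(agenda_field):
        # the API sometimes returns a JSON string, sometimes a parsed list
        if isinstance(agenda_field, str):
            agenda_field = json.loads(agenda_field)
        return agenda_field

    as_string = '[{"bill": {"billName": "HB 1001"}}]'
    as_list = [{"bill": {"billName": "HB 1001"}}]
    assert normalize_agenda(as_string) == normalize_agenda(as_list)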
@@ -114,13 +111,9 @@ def scrape(self):
                        agenda.add_subject(agenda_item["description"])

            for exhibit in meeting.get("exhibits"):
-                # Original URL
-                # exhibit_pdf_url = self.apiclient.get_document_url(
-                #     exhibit["pdfDownloadLink"]
-                # )
-                # Proxy URL used because URL provided by API is not directly accessible over the web
-                exhibit_pdf_url = urljoin(PROXY_BASE_URL, exhibit["pdfDownloadLink"])
-                self.logger.info(exhibit_pdf_url)
+                exhibit_pdf_url = self.apiclient.get_document_url(
+                    exhibit["pdfDownloadLink"]
+                )
                if exhibit_pdf_url:
                    event.add_document(
                        exhibit["description"],
@@ -130,10 +123,7 @@ def scrape(self):

            for minute in meeting.get("minutes"):
                if minute["link"]:
-                    # Original URL
-                    # minute_pdf_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/{_id}_minutes.pdf"
-                    # Proxy URL used because URL provided by API is not directly accessible over the web
-                    minute_pdf_url = urljoin(PROXY_BASE_URL, minute["link"])
+                    minute_pdf_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/{_id}_minutes.pdf"
                    event.add_document(
                        "Meeting Minutes",
                        minute_pdf_url,
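Filled in with hypothetical values, the new minutes template resolves to a predictable IGA path; every component below is illustrative, not taken from a real meeting:

    session_no = "123"
    session = "2024"
    committee_chamber = "house"
    committee_type = "standing"
    name_slug = "judiciary"
    _id = "ABCD123"  # hypothetical meeting id
    minute_pdf_url = (
        f"https://iga.in.gov/pdf-documents/{session_no}/{session}/"
        f"{committee_chamber}/committees/{committee_type}/{name_slug}/"
        f"{_id}/{_id}_minutes.pdf"
    )
    # https://iga.in.gov/pdf-documents/123/2024/house/committees/standing/judiciary/ABCD123/ABCD123_minutes.pdf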
