Merge pull request #5069 from braykuka/1207-IN-bills-scraper-rewrite-2025

IN: fix event exhibits and minutes url
jessemortenson authored Nov 5, 2024
2 parents 7d75b82 + 71b88ed commit f7ed0d3
Showing 2 changed files with 97 additions and 102 deletions.
143 changes: 74 additions & 69 deletions scrapers/in/bills.py
@@ -71,14 +71,12 @@ def _process_votes(self, rollcalls, bill_id, original_chamber, session):

        for r in rollcalls:
            proxy_link = PROXY_BASE_URL + r["link"]

            try:
-                (path, resp) = self.urlretrieve(proxy_link)
-            except scrapelib.HTTPError as e:
-                self.warning(e)
-                self.warning(
+                path, _ = self.urlretrieve(proxy_link)
+            except scrapelib.HTTPError:
+                self.logger.warning(
                    "Unable to contact openstates proxy, skipping vote {}".format(
-                        r["link"]
+                        proxy_link
                    )
                )
                continue
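For reference, the retry-and-skip pattern introduced here can be exercised on its own. A minimal sketch, assuming scrapelib is installed; the vote path is a placeholder, not a real roll-call link:

    import scrapelib

    PROXY_BASE_URL = "https://in-proxy.openstates.org/"

    scraper = scrapelib.Scraper(retry_attempts=3)
    proxy_link = PROXY_BASE_URL + "2024/votes/some-roll-call"  # hypothetical path
    try:
        # urlretrieve downloads to a temp file and returns (path, response)
        path, _ = scraper.urlretrieve(proxy_link)
        print("saved to", path)
    except scrapelib.HTTPError:
        # mirror the scraper: warn and skip this vote instead of aborting the run
        print("Unable to contact openstates proxy, skipping vote", proxy_link)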
@@ -275,23 +273,22 @@ def scrape(self, session=None):

            try:
                bill_json = client.get("bill", session=session, bill_link=bill_link)
+                # vehicle bill
+                if not bill_json:
+                    self.logger.warning("Vehicle Bill: {}".format(bill_id))
+                    continue
            except scrapelib.HTTPError:
                self.logger.warning("Bill could not be accessed. Skipping.")
                continue

-            # vehicle bill
-            if len(list(bill_json.keys())) == 0:
-                self.logger.warning("Vehicle Bill: {}".format(bill_id))
-                continue
-            # sometimes description is blank
-            # if that's the case, we can check to see if
-            # the latest version has a short description
            title = bill_json["description"]
+            # Check if the title is "NoneNone" (indicating a placeholder) and set it to None
            if "NoneNone" in title:
                title = None
+            # If the title is still empty or None, try to get the short description from the latest version
            if not title:
-                title = bill_json["latestVersion"]["shortDescription"]
-            # and if that doesn't work, use the bill_id but throw a warning
+                title = bill_json["latestVersion"].get("shortDescription")
+            # If the title is still not available, use the bill ID and log a warning
            if not title:
                title = bill_id
                self.logger.warning("Bill is missing a title, using bill id instead.")
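Taken together, the new title logic is a three-step fallback chain. A minimal sketch, with field names following the API payload used above:

    def resolve_title(bill_json, bill_id):
        title = bill_json["description"]
        if "NoneNone" in title:  # placeholder value seen in API responses
            title = None
        if not title:
            title = bill_json["latestVersion"].get("shortDescription")
        if not title:
            title = bill_id  # last resort; the scraper also logs a warning
        return title

For example, resolve_title({"description": "NoneNone", "latestVersion": {"shortDescription": "State budget"}}, "HB 1001") returns "State budget".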
@@ -314,19 +311,15 @@ def scrape(self, session=None):

            bill.add_source(api_source, note="API details")

            # sponsors
-            for s in bill_json["authors"]:
-                self._add_sponsor_if_not_blank(bill, s, classification="author")
-            for s in bill_json["coauthors"]:
-                self._add_sponsor_if_not_blank(bill, s, classification="coauthor")
-            for s in bill_json["sponsors"]:
-                self._add_sponsor_if_not_blank(bill, s, classification="sponsor")
-            for s in bill_json["cosponsors"]:
-                self._add_sponsor_if_not_blank(bill, s, classification="cosponsor")
+            for category in ["authors", "coauthors", "sponsors", "cosponsors"]:
+                for sponsor in bill_json.get(category, []):
+                    self._add_sponsor_if_not_blank(
+                        bill, sponsor, classification=category[:-1]
+                    )

            # actions
            action_link = bill_json["actions"]["link"]
            api_source = urljoin(api_base_url, action_link)
-
            try:
                actions = client.get(
                    "bill_actions", session=session, action_link=action_link
@@ -336,75 +329,84 @@ def scrape(self, session=None):
                self.logger.warning("Could not find bill actions page")
                actions = []

-            for a in actions:
-                action_desc = a["description"]
+            for action in actions:
+                action_desc = action["description"]

+                # Determine action chamber
                if "governor" in action_desc.lower():
                    action_chamber = "executive"
-                elif a["chamber"]["name"].lower() == "house":
+                elif action["chamber"]["name"].lower() == "house":
                    action_chamber = "lower"
                else:
                    action_chamber = "upper"
-                date = a["date"]

+                # Process action date
+                date = action.get("date")
+                if not date:
+                    self.logger.warning("Action has no date, skipping")
+                    continue

-                # convert time to pupa fuzzy time
-                date = date.replace("T", " ")
-                # TODO: if we update pupa to accept datetimes we can drop this line
-                date = date.split()[0]
+                # Convert date to pupa fuzzy time format
+                date = date.replace("T", " ").split()[0]  # Extract date part only

-                d = action_desc.lower()
+                action_desc_lower = action_desc.lower()
                committee = None

                reading = False
-                attrs = self.categorizer.categorize(action_desc)
-                action_type = attrs["classification"]

-                if "first reading" in d:
-                    reading = True

-                if "second reading" in d or "reread second time" in d:
-                    reading = True

-                if "third reading" in d or "reread third time" in d:
-                    action_type.append("reading-3")
+                action_type = self.categorizer.categorize(action_desc)["classification"]

+                # Identify reading actions
+                if any(
+                    phase in action_desc_lower
+                    for phase in [
+                        "first reading",
+                        "second reading",
+                        "third reading",
+                        "reread second time",
+                        "reread third time",
+                    ]
+                ):
+                    reading = True

-                if "adopted" in d and reading:
+                if (
+                    "third reading" in action_desc_lower
+                    or "reread third time" in action_desc_lower
+                ):
+                    action_type.append("reading-3")

+                # Mark passage if adopted during reading
+                if "adopted" in action_desc_lower and reading:
                    action_type.append("passage")

-                if (
-                    "referred" in d
-                    and "committee on" in d
-                    or "reassigned" in d
-                    and "committee on" in d
-                ):
-                    committee = d.split("committee on")[-1].strip()
+                # Identify related committee
+                if "committee on" in action_desc_lower:
+                    committee = action_desc_lower.split("committee on")[-1].strip()

-                a = bill.add_action(
+                # Add action to bill
+                action_instance = bill.add_action(
                    chamber=action_chamber,
                    description=action_desc,
                    date=date,
                    classification=action_type,
                )

+                # Add committee as related entity if present
                if committee:
-                    a.add_related_entity(committee, entity_type="organization")
-
-            # subjects
-            subjects = [s["entry"] for s in bill_json["latestVersion"]["subjects"]]
-            for subject in subjects:
-                subject = (
-                    subject
-                    if not subject.startswith("PENSIONS AND RETIREMENT BENEFITS")
-                    else "PENSIONS AND RETIREMENT BENEFITS; Public Retirement System (INPRS)"
-                )
+                    action_instance.add_related_entity(
+                        committee, entity_type="organization"
+                    )

+            # Extract subjects from the latest version of the bill
+            latest_subjects = bill_json["latestVersion"]["subjects"]
+            for subject_entry in latest_subjects:
+                subject = subject_entry["entry"]
+                if subject.startswith("PENSIONS AND RETIREMENT BENEFITS"):
+                    subject = "PENSIONS AND RETIREMENT BENEFITS; Public Retirement System (INPRS)"
+                # Add the processed subject to the bill
                bill.add_subject(subject)

            # Abstract
-            if bill_json["latestVersion"]["digest"]:
-                bill.add_abstract(bill_json["latestVersion"]["digest"], note="Digest")
+            digest = bill_json["latestVersion"]["digest"]
+            if digest:
+                bill.add_abstract(digest, note="Digest")

            # votes
            yield from self._process_votes(
@@ -415,10 +417,13 @@ def scrape(self, session=None):
            )

            for v in bill_json["versions"]:
                # note there are a number of links in the API response that won't work with just a browser, they need an api key
                # https://iga.in.gov/pdf-documents/123/2024/house/resolutions/HC0001/HC0001.01.INTR.pdf
                category = "resolutions" if "resolution" in bill_type else "bills"
-                url = f"https://iga.in.gov/pdf-documents/{self.session_no}/{bill_json['year']}/{bill_json['originChamber']}/{category}/{v['billName']}/{v['printVersionName']}.pdf"
+                url = (
+                    f"https://iga.in.gov/pdf-documents/{self.session_no}/"
+                    f"{bill_json['year']}/{bill_json['originChamber']}/"
+                    f"{category}/{v['billName']}/{v['printVersionName']}.pdf"
+                )
                # PROXY URL
                # url = urljoin(PROXY_BASE_URL, v['link'])
                bill.add_version_link(
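Plugging in the values from the sample URL quoted in the comment above shows what the reformatted multi-line template produces; all values are illustrative:

    session_no = "123"
    year = "2024"
    origin_chamber = "house"
    category = "resolutions"  # "resolution" in bill_type
    bill_name = "HC0001"
    print_version = "HC0001.01.INTR"
    url = (
        f"https://iga.in.gov/pdf-documents/{session_no}/"
        f"{year}/{origin_chamber}/"
        f"{category}/{bill_name}/{print_version}.pdf"
    )
    assert url == (
        "https://iga.in.gov/pdf-documents/123/2024/house"
        "/resolutions/HC0001/HC0001.01.INTR.pdf"
    )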
56 changes: 23 additions & 33 deletions scrapers/in/events.py
@@ -1,5 +1,4 @@
import json
- import logging
import re
from datetime import date
from urllib.parse import urljoin
@@ -12,10 +11,6 @@
from openstates.exceptions import EmptyScrape


- log = logging.getLogger(__name__)
- PROXY_BASE_URL = "https://in-proxy.openstates.org/"
-
-
class INEventScraper(Scraper):
    _tz = pytz.timezone("America/Indianapolis")
    base_url = "https://beta-api.iga.in.gov"
@@ -28,9 +23,10 @@ def __init__(self, *args, **kwargs):
    def scrape(self):
        session_no = self.apiclient.get_session_no(self.session)
        response = self.apiclient.get("meetings", session=self.session)
+
        meetings = response["meetings"]
-        if len(meetings["items"]) == 0:
-            raise EmptyScrape
+        if not meetings["items"]:
+            raise EmptyScrape("No meetings found in the response.")

        for item in meetings["items"]:
            meeting = self.apiclient.get(
@@ -41,9 +37,6 @@ def scrape(self):
                continue

            committee = meeting["committee"]
-
-            link = urljoin(self.base_url, meeting["link"])
-            _id = link.split("/")[-1]
            committee_name = (
                committee["name"]
                .replace(",", "")
@@ -58,19 +51,25 @@ def scrape(self):
            committee_chamber = (
                committee["chamber"].lower() if committee["chamber"] else "universal"
            )
-            date = meeting["meetingdate"].replace(" ", "")
-            time = meeting["starttime"]
-            if time:
-                time = time.replace(" ", "")
-                when = dateutil.parser.parse(f"{date} {time}")
+
+            link = urljoin(self.base_url, meeting["link"])
+            _id = link.split("/")[-1]
+
+            date_str = meeting["meetingdate"].replace(" ", "")
+            time_str = meeting["starttime"]
+            # Determine the 'when' variable based on the presence of time
+            if time_str:
+                time_str = time_str.replace(
+                    " ", ""
+                )  # Clean up any spaces in the time string
+                when = dateutil.parser.parse(f"{date_str} {time_str}")
                when = self._tz.localize(when)
                all_day = False
            else:
-                when = dateutil.parser.parse(date).date()
+                when = dateutil.parser.parse(date_str).date()
                all_day = True

            location = meeting["location"] or "See Agenda"

            video_url = (
                f"https://iga.in.gov/legislative/{self.session}/meeting/watchlive/{_id}"
            )
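The reworked date handling distinguishes timed meetings from all-day ones. In isolation it reduces to this sketch, using dateutil and pytz as the scraper does; the sample inputs are illustrative guesses at the API's string formats:

    import dateutil.parser
    import pytz

    tz = pytz.timezone("America/Indianapolis")

    def parse_when(meetingdate, starttime):
        date_str = meetingdate.replace(" ", "")
        if starttime:
            time_str = starttime.replace(" ", "")
            when = tz.localize(dateutil.parser.parse(f"{date_str} {time_str}"))
            return when, False  # timezone-aware datetime, not all-day
        return dateutil.parser.parse(date_str).date(), True  # all-day event

    print(parse_when("2024-11-05", "10:30 AM"))  # (2024-11-05 10:30:00-05:00, False)
    print(parse_when("2024-11-05", None))        # (2024-11-05, True)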
@@ -100,11 +99,9 @@ def scrape(self):
            event.add_media_link("Video of Hearing", video_url, media_type="text/html")

            agendas = meeting["agenda"]
-            if type(agendas) is str:
-                agendas = json.loads(meeting["agenda"])
-            if agendas:
-                agenda = event.add_agenda_item("Bills under consideration")
-
+            if isinstance(agendas, str):
+                agendas = json.loads(agendas)
+            agenda = event.add_agenda_item("Bills under consideration")
            for agenda_item in agendas:
                if agenda_item.get("bill", None):
                    bill_id = agenda_item["bill"].get("billName")
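The isinstance change handles the API returning the agenda either as a JSON-encoded string or as an already-parsed list; both shapes normalize to the same thing. A small sketch with illustrative payloads:

    import json

    def normalize_agenda(agenda_field):
        # the API sometimes returns a JSON string, sometimes a parsed list
        if isinstance(agenda_field, str):
            agenda_field = json.loads(agenda_field)
        return agenda_field

    as_string = '[{"bill": {"billName": "HB 1001"}}]'
    as_list = [{"bill": {"billName": "HB 1001"}}]
    assert normalize_agenda(as_string) == normalize_agenda(as_list)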
@@ -114,13 +111,9 @@ def scrape(self):
                        agenda.add_subject(agenda_item["description"])

            for exhibit in meeting.get("exhibits"):
-                # Original URL
-                # exhibit_pdf_url = self.apiclient.get_document_url(
-                #     exhibit["pdfDownloadLink"]
-                # )
-                # Proxy URL used because URL provided by API is not directly accessible over the web
-                exhibit_pdf_url = urljoin(PROXY_BASE_URL, exhibit["pdfDownloadLink"])
-                self.logger.info(exhibit_pdf_url)
+                exhibit_pdf_url = self.apiclient.get_document_url(
+                    exhibit["pdfDownloadLink"]
+                )
                if exhibit_pdf_url:
                    event.add_document(
                        exhibit["description"],
@@ -130,10 +123,7 @@ def scrape(self):

            for minute in meeting.get("minutes"):
                if minute["link"]:
-                    # Original URL
-                    # minute_pdf_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/{_id}_minutes.pdf"
-                    # Proxy URL used because URL provided by API is not directly accessible over the web
-                    minute_pdf_url = urljoin(PROXY_BASE_URL, minute["link"])
+                    minute_pdf_url = f"https://iga.in.gov/pdf-documents/{session_no}/{self.session}/{committee_chamber}/committees/{committee_type}/{name_slug}/{_id}/{_id}_minutes.pdf"
                    event.add_document(
                        "Meeting Minutes",
                        minute_pdf_url,
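Filled in with hypothetical values, the new minutes template resolves to a predictable IGA path; every component below is illustrative, not taken from a real meeting:

    session_no = "123"
    session = "2024"
    committee_chamber = "house"
    committee_type = "standing"
    name_slug = "judiciary"
    _id = "ABCD123"  # hypothetical meeting id
    minute_pdf_url = (
        f"https://iga.in.gov/pdf-documents/{session_no}/{session}/"
        f"{committee_chamber}/committees/{committee_type}/{name_slug}/"
        f"{_id}/{_id}_minutes.pdf"
    )
    # https://iga.in.gov/pdf-documents/123/2024/house/committees/standing/judiciary/ABCD123/ABCD123_minutes.pdf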
