
Commit aa1de2f

Scrape English RCV list
Sometime around 2024, the Parliament stopped including multilingual (French/English/German) titles in the RCV lists; the French version now includes only the French title. As a result, the title displayed on HowTheyVote.eu now usually comes from the OEIL procedure page rather than from the RCV list. That works well in general, as the procedure info is usually available before the vote takes place and most votes have a corresponding OEIL procedure page. However, in some cases this doesn't work.

This commit fixes that by scraping the English version of the RCV list. We had a scraper for these lists some time ago, but it was no longer in use and doesn't work with the current structure of the RCV lists, so this is essentially a new scraper reusing the old name.
1 parent 0db42ef commit aa1de2f

7 files changed, +137 -37 lines changed

7 files changed

+137
-37
lines changed

backend/howtheyvote/api/votes_api.py

Lines changed: 2 additions & 2 deletions
@@ -40,10 +40,10 @@
 
 SOURCE_INFO = {
     "RCVListScraper": {
-        "name": "Results of roll-call votes (XML)",
+        "name": "Results of roll-call votes (French, XML)",
     },
     "RCVListEnglishScraper": {
-        "name": "Results of roll-call votes (XML)",
+        "name": "Results of roll-call votes (English, XML)",
     },
     "ProcedureScraper": {
         "name": "Procedure file (Legislative Observatory)",

backend/howtheyvote/cli/pipeline.py

Lines changed: 11 additions & 0 deletions
@@ -9,6 +9,7 @@
 from ..pipelines import (
     MembersPipeline,
     PressPipeline,
+    RCVListEnglishPipeline,
     RCVListPipeline,
     SessionsPipeline,
 )
@@ -73,6 +74,16 @@ def rcv_list(term: int, date: datetime.datetime) -> None:
     pipeline.run()
 
 
+@pipeline.command()
+@click.option("--term", type=int, required=True)
+@click.option("--date", type=click.DateTime(formats=["%Y-%m-%d"]), required=True)
+def rcv_list_en(term: int, date: datetime.datetime) -> None:
+    """Run the English RCV lists pipeline for a given day. This scrapes only vote titles,
+    but not the actual vote results or any other data."""
+    pipeline = RCVListEnglishPipeline(term, date)
+    pipeline.run()
+
+
 @pipeline.command()
 @click.option("--date", type=click.DateTime(formats=["%Y-%m-%d"]), required=True)
 @click.option("--rss", type=bool, is_flag=True)

backend/howtheyvote/pipelines/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -2,11 +2,13 @@
 from .members import MembersPipeline
 from .press import PressPipeline
 from .rcv_list import RCVListPipeline
+from .rcv_list_en import RCVListEnglishPipeline
 from .sessions import SessionsPipeline
 
 __all__ = [
     "PipelineResult",
     "RCVListPipeline",
+    "RCVListEnglishPipeline",
     "PressPipeline",
     "MembersPipeline",
     "SessionsPipeline",
backend/howtheyvote/pipelines/rcv_list_en.py

Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+import datetime
+from collections.abc import Iterator
+
+from cachetools import LRUCache
+
+from ..models import Vote
+from ..scrapers import RCVListEnglishScraper, RequestCache
+from ..store import Aggregator, BulkWriter, index_records, map_vote
+from .common import BasePipeline
+
+
+class RCVListEnglishPipeline(BasePipeline):
+    """Scrapes the English RCV vote results for a single day. This is a separate pipeline
+    from `RCVListPipeline` (which uses French RCV lists) because the English and French
+    versions are often published at different times."""
+
+    def __init__(
+        self,
+        term: int,
+        date: datetime.date,
+        last_run_checksum: str | None = None,
+    ):
+        super().__init__(term=term, date=date)
+        self.term = term
+        self.date = date
+        self._vote_ids: set[str] = set()
+        self._request_cache: RequestCache = LRUCache(maxsize=25)
+
+    def _run(self) -> None:
+        self._scrape_rcv_list()
+        self._index_votes()
+
+    def _scrape_rcv_list(self) -> None:
+        self._log.info("Scraping RCV lists", date=self.date, term=self.term)
+
+        scraper = RCVListEnglishScraper(term=self.term, date=self.date)
+        writer = BulkWriter()
+        writer.add(scraper.run())
+        writer.flush()
+
+        self._vote_ids = writer.get_touched()
+
+    def _index_votes(self) -> None:
+        self._log.info("Indexing votes", date=self.date, term=self.term)
+        index_records(Vote, self._votes())
+
+    def _votes(self) -> Iterator[Vote]:
+        aggregator = Aggregator(Vote)
+        return aggregator.mapped_records(map_func=map_vote, group_keys=self._vote_ids)

backend/howtheyvote/scrapers/votes.py

Lines changed: 19 additions & 35 deletions
@@ -1,4 +1,5 @@
 import re
+from collections.abc import Iterator
 from datetime import date, datetime
 from typing import cast
 from urllib.parse import parse_qs, urlparse
@@ -276,6 +277,12 @@ def _text(self, tag: Tag) -> str:
 
 
 class RCVListEnglishScraper(BeautifulSoupScraper):
+    """Since ~2024 the Parliament has stopped including multilingual (French/English/German)
+    titles in the RCV lists. Now, the French version includes only the French title. This
+    scraper is used only to extract English titles for votes. We still use the French version
+    for all other data, as it is the primary language and often available before other
+    translations are published."""
+
     BS_PARSER = "lxml-xml"
     BASE_URL = "https://www.europarl.europa.eu/doceo/document"
 
@@ -290,49 +297,26 @@ def _url(self) -> str:
 
         return url
 
-    def _extract_data(self, doc: BeautifulSoup) -> list[Fragment]:
-        tags = doc.find_all("RollCallVote.Result")
-        fragments = []
-
-        for tag in tags:
-            doceo_vote_id = int(tag["Identifier"])
+    def _extract_data(self, doc: BeautifulSoup) -> Iterator[Fragment]:
+        title_by_dlv_id: dict[int, str] = {}
 
-            text = tag.find("RollCallVote.Description.Text")
-            text = text.text.strip().removeprefix("- ")
-            # timestamp_regex = self._timestamp_regex()
-            # text = re.sub(timestamp_regex, "", text).strip()
-            text = normalize_whitespace(text)
+        for tag in doc.find_all("VoteTitle"):
+            dlv_id = int(tag["DlvId"])
+            title = tag.text.strip()
+            title_by_dlv_id[dlv_id] = title
 
-            title, _, reference, description = parse_rcv_text(
-                text,
-                # The english XML files contain English titles only
-                extract_english=False,
-            )
+        for tag in doc.find_all("RollCallVote.Result"):
+            doceo_vote_id = int(tag["Identifier"])
+            dlv_id = int(tag["DlvId"])
+            title = title_by_dlv_id[dlv_id]
 
-            fragment = self._fragment(
+            yield self._fragment(
                 model=Vote,
                 source_id=doceo_vote_id,
                 group_key=doceo_vote_id,
-                data={
-                    "title_en": title,
-                    "reference": reference,
-                    "description_en": description,
-                },
+                data={"title": title},
             )
 
-            fragments.append(fragment)
-
-        fragments = fill_missing_by_reference(fragments, key="title_en")
-
-        for fragment in fragments:
-            # Reference is only needed temporarily to fill missing titles by reference,
-            # but storing it would be redundant as it is the same in the French lists.
-            fragment.data.pop("reference")
-
-        self._log.info("Extracted English RCV votes", count=len(fragments))
-
-        return fragments
-
 
 class DocumentScraper(BeautifulSoupScraper):
     BS_PARSER = "lxml"

backend/tests/scrapers/data/votes/rcv_list_pv-10-2024-09-18-rcv-en.xml

Lines changed: 16 additions & 0 deletions
(XML fixture; contents not shown.)

backend/tests/scrapers/test_votes.py

Lines changed: 38 additions & 0 deletions
@@ -8,6 +8,7 @@
     EurlexDocumentScraper,
     EurlexProcedureScraper,
     ProcedureScraper,
+    RCVListEnglishScraper,
     RCVListScraper,
 )
 
@@ -230,6 +231,43 @@ def test_rcv_list_scraper_timestamp_from_text(responses):
     assert data.get("description") is None
 
 
+def test_rcv_list_english_scraper(responses):
+    responses.get(
+        "https://www.europarl.europa.eu/doceo/document/PV-10-2024-09-18-RCV_EN.xml",
+        body=load_fixture("scrapers/data/votes/rcv_list_pv-10-2024-09-18-rcv-en.xml"),
+    )
+
+    scraper = RCVListEnglishScraper(term=10, date=datetime.date(2024, 9, 18))
+    data = list(scraper.run())
+
+    expected = [
+        Fragment(
+            model="Vote",
+            source_name="RCVListEnglishScraper",
+            source_id=169418,
+            group_key=169418,
+            source_url="https://www.europarl.europa.eu/doceo/document/PV-10-2024-09-18-RCV_EN.xml",
+            data={
+                "title": "Objection pursuant to Rule 115(2) and (3), and Rule 115(4)(c): Maximum residue levels for carbendazim and thiophanate-methyl",
+            },
+        ),
+        Fragment(
+            model="Vote",
+            source_name="RCVListEnglishScraper",
+            source_id=169419,
+            group_key=169419,
+            source_url="https://www.europarl.europa.eu/doceo/document/PV-10-2024-09-18-RCV_EN.xml",
+            data={
+                "title": "Objection pursuant to Rule 115(2) and (3), and Rule 115(4)(c): Maximum residue levels for cyproconazole",
+            },
+        ),
+    ]
+
+    assert len(data) == 2
+    assert record_to_dict(data[0]) == record_to_dict(expected[0])
+    assert record_to_dict(data[1]) == record_to_dict(expected[1])
+
+
 def test_procedure_scraper(responses):
     responses.get(
         "https://oeil.secure.europarl.europa.eu/oeil/en/procedure-file?reference=2023/2019(INI)",
