
Commit aa1de2f

Scrape English RCV list
Sometime around 2024, the Parliament stopped including multilingual (French/English/German) titles in the RCV lists; the French version now includes only the French title. As a result, the title displayed on HowTheyVote.eu now usually comes from the OEIL procedure page rather than from the RCV list. That works well in general, as the procedure info is usually available before the vote takes place and most votes have a corresponding OEIL procedure page. However, in some cases this doesn't work.

This commit fixes that by scraping the English version of the RCV list. We had a scraper for these lists some time ago, but it was no longer in use and doesn't work with the current structure of the RCV lists, so this is essentially a new scraper reusing the old name.
1 parent 0db42ef commit aa1de2f

7 files changed, +137 -37 lines changed

7 files changed

+137
-37
lines changed

backend/howtheyvote/api/votes_api.py

Lines changed: 2 additions & 2 deletions
@@ -40,10 +40,10 @@
 
 SOURCE_INFO = {
     "RCVListScraper": {
-        "name": "Results of roll-call votes (XML)",
+        "name": "Results of roll-call votes (French, XML)",
     },
     "RCVListEnglishScraper": {
-        "name": "Results of roll-call votes (XML)",
+        "name": "Results of roll-call votes (English, XML)",
     },
     "ProcedureScraper": {
         "name": "Procedure file (Legislative Observatory)",

backend/howtheyvote/cli/pipeline.py

Lines changed: 11 additions & 0 deletions
@@ -9,6 +9,7 @@
 from ..pipelines import (
     MembersPipeline,
     PressPipeline,
+    RCVListEnglishPipeline,
     RCVListPipeline,
     SessionsPipeline,
 )
@@ -73,6 +74,16 @@ def rcv_list(term: int, date: datetime.datetime) -> None:
     pipeline.run()
 
 
+@pipeline.command()
+@click.option("--term", type=int, required=True)
+@click.option("--date", type=click.DateTime(formats=["%Y-%m-%d"]), required=True)
+def rcv_list_en(term: int, date: datetime.datetime) -> None:
+    """Run the English RCV lists pipeline for a given day. This scrapes only vote titles,
+    but not the actual vote results or any other data."""
+    pipeline = RCVListEnglishPipeline(term, date)
+    pipeline.run()
+
+
 @pipeline.command()
 @click.option("--date", type=click.DateTime(formats=["%Y-%m-%d"]), required=True)
 @click.option("--rss", type=bool, is_flag=True)

backend/howtheyvote/pipelines/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -2,11 +2,13 @@
 from .members import MembersPipeline
 from .press import PressPipeline
 from .rcv_list import RCVListPipeline
+from .rcv_list_en import RCVListEnglishPipeline
 from .sessions import SessionsPipeline
 
 __all__ = [
     "PipelineResult",
     "RCVListPipeline",
+    "RCVListEnglishPipeline",
     "PressPipeline",
     "MembersPipeline",
     "SessionsPipeline",
backend/howtheyvote/pipelines/rcv_list_en.py

Lines changed: 49 additions & 0 deletions

@@ -0,0 +1,49 @@
+import datetime
+from collections.abc import Iterator
+
+from cachetools import LRUCache
+
+from ..models import Vote
+from ..scrapers import RCVListEnglishScraper, RequestCache
+from ..store import Aggregator, BulkWriter, index_records, map_vote
+from .common import BasePipeline
+
+
+class RCVListEnglishPipeline(BasePipeline):
+    """Scrapes the English RCV vote results for a single day. This is a separate pipeline
+    from `RCVListPipeline` (which uses French RCV lists) because the English and French
+    versions are often published at different times."""
+
+    def __init__(
+        self,
+        term: int,
+        date: datetime.date,
+        last_run_checksum: str | None = None,
+    ):
+        super().__init__(term=term, date=date)
+        self.term = term
+        self.date = date
+        self._vote_ids: set[str] = set()
+        self._request_cache: RequestCache = LRUCache(maxsize=25)
+
+    def _run(self) -> None:
+        self._scrape_rcv_list()
+        self._index_votes()
+
+    def _scrape_rcv_list(self) -> None:
+        self._log.info("Scraping RCV lists", date=self.date, term=self.term)
+
+        scraper = RCVListEnglishScraper(term=self.term, date=self.date)
+        writer = BulkWriter()
+        writer.add(scraper.run())
+        writer.flush()
+
+        self._vote_ids = writer.get_touched()
+
+    def _index_votes(self) -> None:
+        self._log.info("Indexing votes", date=self.date, term=self.term)
+        index_records(Vote, self._votes())
+
+    def _votes(self) -> Iterator[Vote]:
+        aggregator = Aggregator(Vote)
+        return aggregator.mapped_records(map_func=map_vote, group_keys=self._vote_ids)

backend/howtheyvote/scrapers/votes.py

Lines changed: 19 additions & 35 deletions
@@ -1,4 +1,5 @@
 import re
+from collections.abc import Iterator
 from datetime import date, datetime
 from typing import cast
 from urllib.parse import parse_qs, urlparse
@@ -276,6 +277,12 @@ def _text(self, tag: Tag) -> str:
 
 
 class RCVListEnglishScraper(BeautifulSoupScraper):
+    """Since ~2024 the Parliament has stopped including multilingual (French/English/German)
+    titles in the RCV lists. Now, the French version includes only the French title. This
+    scraper is used only to extract English titles for votes. We still use the French version
+    for all other data, as it is the primary language and often available before other
+    translations are published."""
+
     BS_PARSER = "lxml-xml"
     BASE_URL = "https://www.europarl.europa.eu/doceo/document"
 
@@ -290,49 +297,26 @@ def _url(self) -> str:
 
         return url
 
-    def _extract_data(self, doc: BeautifulSoup) -> list[Fragment]:
-        tags = doc.find_all("RollCallVote.Result")
-        fragments = []
-
-        for tag in tags:
-            doceo_vote_id = int(tag["Identifier"])
+    def _extract_data(self, doc: BeautifulSoup) -> Iterator[Fragment]:
+        title_by_dlv_id: dict[int, str] = {}
 
-            text = tag.find("RollCallVote.Description.Text")
-            text = text.text.strip().removeprefix("- ")
-            # timestamp_regex = self._timestamp_regex()
-            # text = re.sub(timestamp_regex, "", text).strip()
-            text = normalize_whitespace(text)
+        for tag in doc.find_all("VoteTitle"):
+            dlv_id = int(tag["DlvId"])
+            title = tag.text.strip()
+            title_by_dlv_id[dlv_id] = title
 
-            title, _, reference, description = parse_rcv_text(
-                text,
-                # The english XML files contain English titles only
-                extract_english=False,
-            )
+        for tag in doc.find_all("RollCallVote.Result"):
+            doceo_vote_id = int(tag["Identifier"])
+            dlv_id = int(tag["DlvId"])
+            title = title_by_dlv_id[dlv_id]
 
-            fragment = self._fragment(
+            yield self._fragment(
                 model=Vote,
                 source_id=doceo_vote_id,
                 group_key=doceo_vote_id,
-                data={
-                    "title_en": title,
-                    "reference": reference,
-                    "description_en": description,
-                },
+                data={"title": title},
             )
 
-            fragments.append(fragment)
-
-        fragments = fill_missing_by_reference(fragments, key="title_en")
-
-        for fragment in fragments:
-            # Reference is only needed temporarily to fill missing titles by reference,
-            # but storing it would be redundant as it is the same in the French lists.
-            fragment.data.pop("reference")
-
-        self._log.info("Extracted English RCV votes", count=len(fragments))
-
-        return fragments
-
 
 class DocumentScraper(BeautifulSoupScraper):
     BS_PARSER = "lxml"

backend/tests/scrapers/data/votes/rcv_list_pv-10-2024-09-18-rcv-en.xml

Lines changed: 16 additions & 0 deletions
(XML fixture; contents not shown.)

backend/tests/scrapers/test_votes.py

Lines changed: 38 additions & 0 deletions
@@ -8,6 +8,7 @@
     EurlexDocumentScraper,
     EurlexProcedureScraper,
     ProcedureScraper,
+    RCVListEnglishScraper,
     RCVListScraper,
 )
 
@@ -230,6 +231,43 @@ def test_rcv_list_scraper_timestamp_from_text(responses):
     assert data.get("description") is None
 
 
+def test_rcv_list_english_scraper(responses):
+    responses.get(
+        "https://www.europarl.europa.eu/doceo/document/PV-10-2024-09-18-RCV_EN.xml",
+        body=load_fixture("scrapers/data/votes/rcv_list_pv-10-2024-09-18-rcv-en.xml"),
+    )
+
+    scraper = RCVListEnglishScraper(term=10, date=datetime.date(2024, 9, 18))
+    data = list(scraper.run())
+
+    expected = [
+        Fragment(
+            model="Vote",
+            source_name="RCVListEnglishScraper",
+            source_id=169418,
+            group_key=169418,
+            source_url="https://www.europarl.europa.eu/doceo/document/PV-10-2024-09-18-RCV_EN.xml",
+            data={
+                "title": "Objection pursuant to Rule 115(2) and (3), and Rule 115(4)(c): Maximum residue levels for carbendazim and thiophanate-methyl",
+            },
+        ),
+        Fragment(
+            model="Vote",
+            source_name="RCVListEnglishScraper",
+            source_id=169419,
+            group_key=169419,
+            source_url="https://www.europarl.europa.eu/doceo/document/PV-10-2024-09-18-RCV_EN.xml",
+            data={
+                "title": "Objection pursuant to Rule 115(2) and (3), and Rule 115(4)(c): Maximum residue levels for cyproconazole",
+            },
+        ),
+    ]
+
+    assert len(data) == 2
+    assert record_to_dict(data[0]) == record_to_dict(expected[0])
+    assert record_to_dict(data[1]) == record_to_dict(expected[1])
+
+
 def test_procedure_scraper(responses):
     responses.get(
         "https://oeil.secure.europarl.europa.eu/oeil/en/procedure-file?reference=2023/2019(INI)",
