Skip to content

Commit a9b8057

Browse files
Handle votes with joint responsible committees (#1122)
2 parents 1f5a328 + e1faf83 commit a9b8057

File tree

10 files changed

+2698
-39
lines changed

10 files changed

+2698
-39
lines changed
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
"""Make responsible_committee column JSON in votes table
2+
3+
Revision ID: 8d1995cb0bed
4+
Revises: 064daf473f9a
5+
Create Date: 2025-03-16 17:13:08.000602
6+
7+
"""
8+
9+
import sqlalchemy as sa
10+
from alembic import op
11+
12+
# revision identifiers, used by Alembic.
13+
revision = "8d1995cb0bed"
14+
down_revision = "064daf473f9a"
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade() -> None:
    """Replace the ``responsible_committee`` column of ``votes`` with a JSON column.

    The old column is dropped and a new ``responsible_committees`` column of
    type JSON is added. Nothing is copied between the two, so existing rows
    carry no value in the new column after this migration has run.
    """
    table_name = "votes"
    json_column = sa.Column("responsible_committees", sa.JSON)

    op.drop_column(table_name, "responsible_committee")
    op.add_column(table_name, json_column)
22+
23+
24+
def downgrade() -> None:
    """Restore the single-valued ``responsible_committee`` column on ``votes``.

    The JSON ``responsible_committees`` column is dropped and a Unicode
    ``responsible_committee`` column is added in its place. As in the upgrade,
    no values are carried over between the two columns.
    """
    table_name = "votes"
    unicode_column = sa.Column("responsible_committee", sa.Unicode)

    op.drop_column(table_name, "responsible_committees")
    op.add_column(table_name, unicode_column)

backend/howtheyvote/api/serializers.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -263,8 +263,8 @@ class BaseVoteDict(TypedDict):
263263
"""Concepts from the [EuroVoc](https://eur-lex.europa.eu/browse/eurovoc.html) thesaurus
264264
that are related to this vote"""
265265

266-
responsible_committee: CommitteeDict | None
267-
"""Committee responsible for the legislative procedure"""
266+
responsible_committees: list[CommitteeDict] | None
267+
"""Committees responsible for the legislative procedure"""
268268

269269
result: VoteResult | None
270270
"""Vote result. This field is only available for votes starting in 2024."""
@@ -273,9 +273,9 @@ class BaseVoteDict(TypedDict):
273273
def serialize_base_vote(vote: Vote) -> BaseVoteDict:
274274
geo_areas = [serialize_country(geo_area) for geo_area in vote.geo_areas]
275275
eurovoc_concepts = [serialize_eurovoc_concept(ec) for ec in vote.eurovoc_concepts]
276-
responsible_committee = (
277-
serialize_committee(vote.responsible_committee) if vote.responsible_committee else None
278-
)
276+
responsible_committees = [
277+
serialize_committee(committee) for committee in vote.responsible_committees
278+
]
279279

280280
return {
281281
"id": vote.id,
@@ -285,7 +285,7 @@ def serialize_base_vote(vote: Vote) -> BaseVoteDict:
285285
"reference": vote.reference,
286286
"geo_areas": geo_areas,
287287
"eurovoc_concepts": eurovoc_concepts,
288-
"responsible_committee": responsible_committee,
288+
"responsible_committees": responsible_committees,
289289
"result": vote.result,
290290
}
291291

backend/howtheyvote/export/__init__.py

Lines changed: 70 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from ..db import Session
1111
from ..helpers import parse_procedure_reference
12-
from ..models import Country, EurovocConcept, Member, Vote
12+
from ..models import Committee, Country, EurovocConcept, Member, Vote
1313
from ..vote_stats import count_vote_positions
1414
from .csvw_helpers import Table
1515

@@ -149,9 +149,6 @@ class VoteRow(TypedDict):
149149
`OLP_SECOND_READING`, `OLP_THIRD_READING`.This field is only available for votes starting
150150
in 2024 and if the vote is part of an Ordinary Legislative Procedure."""
151151

152-
responsible_committee_code: str | None
153-
"""Committee responsible for the legislative procedure"""
154-
155152
count_for: int
156153
"""Number of MEPs who voted in favor"""
157154

@@ -239,6 +236,29 @@ class GeoAreaVoteRow(TypedDict):
239236
"""Geographic area code"""
240237

241238

239+
class CommitteeRow(TypedDict):
    """Each row represents a committee of the European Parliament."""

    # Row type of the `committees` export table, which declares `code` as its
    # primary key. Rows are written by `Export.export_committees`.

    code: str
    """Unique identifier of the committee"""

    label: str
    """Label"""

    abbreviation: str
    """Abbreviation"""
250+
251+
252+
class ResponsibleCommitteeVoteRow(TypedDict):
    """Committee responsible for the legislative procedure a vote is part of."""

    # Row type of the `responsible_committee_votes` export table, whose primary
    # key is the pair (`vote_id`, `committee_code`). `Export.export_votes`
    # writes one row per entry in a vote's `responsible_committees`, so a vote
    # with several responsible committees yields several rows.

    vote_id: int
    """Vote ID"""

    committee_code: str
    """Committee code"""
260+
261+
242262
class Export:
243263
def __init__(self, outdir: pathlib.Path):
244264
self.outdir = outdir
@@ -313,6 +333,20 @@ def __init__(self, outdir: pathlib.Path):
313333
primary_key=["vote_id", "geo_area_code"],
314334
)
315335

336+
self.committees = Table(
337+
row_type=CommitteeRow,
338+
outdir=self.outdir,
339+
name="committees",
340+
primary_key=["code"],
341+
)
342+
343+
self.responsible_committee_votes = Table(
344+
row_type=ResponsibleCommitteeVoteRow,
345+
outdir=self.outdir,
346+
name="responsible_committee_votes",
347+
primary_key=["vote_id", "committee_code"],
348+
)
349+
316350
def run(self) -> None:
317351
self.fetch_members()
318352
self.write_export_timestamp()
@@ -328,12 +362,15 @@ def run(self) -> None:
328362
self.eurovoc_concept_votes,
329363
self.geo_areas,
330364
self.geo_area_votes,
365+
self.committees,
366+
self.responsible_committee_votes,
331367
]
332368
)
333369
self.export_members()
334370
self.export_votes()
335371
self.export_eurovoc_concepts()
336372
self.export_geo_areas()
373+
self.export_committees()
337374

338375
def fetch_members(self) -> None:
339376
self.members_by_id: dict[int, Member] = {}
@@ -436,6 +473,7 @@ def export_votes(self) -> None:
436473
self.member_votes.open() as member_votes,
437474
self.eurovoc_concept_votes.open() as eurovoc_concept_votes,
438475
self.geo_area_votes.open() as geo_area_votes,
476+
self.responsible_committee_votes.open() as responsible_committee_votes,
439477
):
440478
query = select(Vote).order_by(Vote.id).execution_options(yield_per=500)
441479
result = Session.scalars(query)
@@ -444,10 +482,6 @@ def export_votes(self) -> None:
444482
if idx % 1000 == 0:
445483
log.info("Writing vote", index=idx)
446484

447-
responsible_committee_code = (
448-
vote.responsible_committee.code if vote.responsible_committee else None
449-
)
450-
451485
position_counts = count_vote_positions(vote.member_votes)
452486
procedure_reference = (
453487
parse_procedure_reference(vote.procedure_reference)
@@ -471,7 +505,6 @@ def export_votes(self) -> None:
471505
"procedure_stage": (
472506
vote.procedure_stage.value if vote.procedure_stage else None
473507
),
474-
"responsible_committee_code": responsible_committee_code,
475508
"count_for": position_counts["FOR"],
476509
"count_against": position_counts["AGAINST"],
477510
"count_abstention": position_counts["ABSTENTION"],
@@ -496,6 +529,14 @@ def export_votes(self) -> None:
496529
}
497530
)
498531

532+
for responsible_committee in vote.responsible_committees:
533+
responsible_committee_votes.write_row(
534+
{
535+
"vote_id": vote.id,
536+
"committee_code": responsible_committee.code,
537+
}
538+
)
539+
499540
for member_vote in sorted(vote.member_votes, key=lambda mv: mv.web_id):
500541
member = self.members_by_id[member_vote.web_id]
501542
group = member.group_at(vote.timestamp)
@@ -554,6 +595,26 @@ def export_geo_areas(self) -> None:
554595
}
555596
)
556597

598+
def export_committees(self) -> None:
    """Write one row to the committees table per committee referenced by a vote.

    The distinct committee codes are collected from the JSON
    ``responsible_committees`` column across all votes, ordered by code, and
    each code is looked up in ``Committee`` to obtain its label and
    abbreviation.
    """
    log.info("Exporting committees")

    with self.committees.open() as committees:
        # `json_each` expands the JSON list stored on each vote into one row
        # per element; `value` is the column holding the element itself.
        exp = func.json_each(Vote.responsible_committees).table_valued("value")
        query = (
            select(func.distinct(exp.c.value)).select_from(Vote, exp).order_by(exp.c.value)
        )
        committee_codes = Session.execute(query).scalars()

        for committee_code in committee_codes:
            # Plain lookup: the previous `... if True else None` had an
            # unreachable else branch and only obscured that `committee` is
            # always the looked-up value.
            committee = Committee[committee_code]
            committees.write_row(
                {
                    "code": committee.code,
                    "label": committee.label,
                    "abbreviation": committee.abbreviation,
                }
            )
617+
557618

558619
def generate_export(path: pathlib.Path) -> None:
559620
with tempfile.TemporaryDirectory() as outdir:

backend/howtheyvote/models/vote.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ class Vote(BaseWithId):
146146
eurovoc_concepts: Mapped[list[EurovocConcept]] = mapped_column(
147147
ListType(EurovocConceptType())
148148
)
149-
responsible_committee: Mapped[Committee] = mapped_column(CommitteeType())
149+
responsible_committees: Mapped[list[Committee]] = mapped_column(ListType(CommitteeType()))
150150
press_release: Mapped[str | None] = mapped_column(sa.Unicode)
151151
issues: Mapped[list[DataIssue]] = mapped_column(ListType(sa.Enum(DataIssue)))
152152

backend/howtheyvote/scrapers/votes.py

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -503,7 +503,7 @@ def _url(self) -> str:
503503
def _extract_data(self, doc: BeautifulSoup) -> Fragment:
504504
title = self._title(doc)
505505
geo_areas = self._geo_areas(doc)
506-
responsible_committee = self._responsible_committee(doc)
506+
responsible_committees = self._responsible_committees(doc)
507507
self._log.info(
508508
"Extracted procedure information",
509509
title=title,
@@ -517,7 +517,7 @@ def _extract_data(self, doc: BeautifulSoup) -> Fragment:
517517
data={
518518
"procedure_title": title,
519519
"geo_areas": geo_areas,
520-
"responsible_committee": responsible_committee,
520+
"responsible_committees": responsible_committees,
521521
},
522522
)
523523

@@ -562,30 +562,31 @@ def _geo_areas(self, doc: BeautifulSoup) -> list[str]:
562562

563563
return geo_areas
564564

565-
def _responsible_committee(self, doc: BeautifulSoup) -> str | None:
565+
def _responsible_committees(self, doc: BeautifulSoup) -> set[str]:
566+
committees: set[str] = set()
567+
566568
table = doc.select_one(
567-
'#erpl_accordion-committee table:has(th:-soup-contains("Committee responsible"))'
569+
"#erpl_accordion-committee :where("
570+
+ 'table:has(th:-soup-contains("Committee responsible")),'
571+
+ 'table:has(th:-soup-contains("Joint committee responsible"))'
572+
+ ")"
568573
)
569574

570575
if not table:
571-
return None
576+
return committees
572577

573-
if len(table.select("tbody tr")) > 1:
574-
# We assume that there is at most one responsible committee
575-
log.warning("More than one responsible committee found")
578+
badges = table.select("tbody tr .erpl_badge-committee")
576579

577-
badge = table.select_one("tbody tr .erpl_badge-committee")
578-
579-
if not badge:
580-
return None
580+
for badge in badges:
581+
text = badge.text.strip()
582+
committee = Committee.get(text)
581583

582-
text = badge.text.strip()
583-
committee = Committee.get(text)
584+
if not committee:
585+
raise ScrapingError(f"Could not find committee {text}")
584586

585-
if not committee:
586-
raise ScrapingError(f"Could not find committee {text}")
587+
committees.add(committee.code)
587588

588-
return committee.code
589+
return committees
589590

590591

591592
class EurlexProcedureScraper(BeautifulSoupScraper):

backend/howtheyvote/store/mappings.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,9 @@ def map_vote(record: CompositeRecord) -> Vote:
6262
member_votes = [deserialize_member_vote(mv) for mv in record.first("member_votes")]
6363
geo_areas = {Country[code] for code in record.chain("geo_areas")}
6464
eurovoc_concepts = {EurovocConcept[id_] for id_ in record.chain("eurovoc_concepts")}
65-
responsible_committee = Committee.get(record.first("responsible_committee"))
65+
responsible_committees = {
66+
Committee[code] for code in record.chain("responsible_committees")
67+
}
6668
result = VoteResult[record.first("result")] if record.first("result") else None
6769
procedure_stage = (
6870
ProcedureStage[record.first("procedure_stage")]
@@ -91,7 +93,7 @@ def map_vote(record: CompositeRecord) -> Vote:
9193
member_votes=member_votes,
9294
geo_areas=geo_areas,
9395
eurovoc_concepts=eurovoc_concepts,
94-
responsible_committee=responsible_committee,
96+
responsible_committees=responsible_committees,
9597
press_release=press_release,
9698
issues=record.chain("issues"),
9799
)

backend/tests/api/test_votes_api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -571,7 +571,7 @@ def test_votes_api_show(records, db_session, api):
571571
"result": "ADOPTED",
572572
"geo_areas": [],
573573
"eurovoc_concepts": [],
574-
"responsible_committee": None,
574+
"responsible_committees": [],
575575
"related": [],
576576
"sources": [
577577
{

backend/tests/export/test_init.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
from howtheyvote.export import Export
77
from howtheyvote.models import (
8+
Committee,
89
Country,
910
EurovocConcept,
1011
Group,
@@ -140,6 +141,7 @@ def test_export_votes(db_session, tmp_path):
140141
Country["MDA"],
141142
Country["RUS"],
142143
],
144+
responsible_committees=[Committee["AFET"]],
143145
result=VoteResult.ADOPTED,
144146
)
145147

@@ -153,8 +155,8 @@ def test_export_votes(db_session, tmp_path):
153155
votes_meta = tmp_path.joinpath("votes.csv-metadata.json")
154156

155157
expected = (
156-
"id,timestamp,display_title,reference,description,is_main,procedure_reference,procedure_title,procedure_type,procedure_stage,responsible_committee_code,count_for,count_against,count_abstention,count_did_not_vote,result\n"
157-
"123456,2024-01-01 00:00:00,Lorem Ipsum,,,False,2025/1234(COD),Lorem Ipsum,COD,OLP_FIRST_READING,,1,0,0,0,ADOPTED\n"
158+
"id,timestamp,display_title,reference,description,is_main,procedure_reference,procedure_title,procedure_type,procedure_stage,count_for,count_against,count_abstention,count_did_not_vote,result\n"
159+
"123456,2024-01-01 00:00:00,Lorem Ipsum,,,False,2025/1234(COD),Lorem Ipsum,COD,OLP_FIRST_READING,1,0,0,0,ADOPTED\n"
158160
)
159161

160162
assert votes_csv.read_text() == expected
@@ -200,6 +202,24 @@ def test_export_votes(db_session, tmp_path):
200202
assert geo_areas_csv.read_text() == expected
201203
assert geo_areas_meta.is_file()
202204

205+
responsible_committee_votes_csv = tmp_path.joinpath("responsible_committee_votes.csv")
206+
responsible_committee_votes_meta = tmp_path.joinpath(
207+
"responsible_committee_votes.csv-metadata.json"
208+
)
209+
210+
expected = "vote_id,committee_code\n123456,AFET\n"
211+
212+
assert responsible_committee_votes_csv.read_text() == expected
213+
assert responsible_committee_votes_meta.is_file()
214+
215+
committees_csv = tmp_path.joinpath("committees.csv")
216+
committees_meta = tmp_path.joinpath("committees.csv-metadata.json")
217+
218+
expected = "code,label,abbreviation\nAFET,Committee on Foreign Affairs,AFET\n"
219+
220+
assert committees_csv.read_text() == expected
221+
assert committees_meta.is_file()
222+
203223

204224
def test_export_votes_country_group(db_session, tmp_path):
205225
member = Member(

0 commit comments

Comments
 (0)