-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
download scripts isolated from the notebook
- Loading branch information
Showing
1 changed file
with
74 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import sys | ||
from lxml import etree | ||
import requests | ||
from pyquery import PyQuery as pq | ||
import pandas as pd | ||
|
||
seed_pages = [ | ||
'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1213.htm', | ||
'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1314.htm', | ||
'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1415.htm' | ||
] | ||
def crawl_seed(seed): | ||
d = pq(seed) | ||
return d('a').map(lambda i, a: a.attrib.get('name', None)).filter(lambda i, s: s.startswith('cm20')) | ||
meetings = [] | ||
for seed_page in seed_pages: | ||
meetings.extend(crawl_seed(seed_page)) | ||
print(meetings) | ||
|
||
def crawl_xml(meeting): | ||
# This logic is translated from the official JS code | ||
yy, mm, dd = map(lambda i: int(meeting[i:(i + 2)]), [4, 6, 8]) | ||
if mm >= 10: | ||
yr = 'yr%02d-%02d' % (yy, yy + 1) | ||
else: | ||
yr = 'yr%02d-%02d' % (yy - 1, yy) | ||
prefix = 'http://www.legco.gov.hk' | ||
url = '%(prefix)s/%(yr)s/chinese/counmtg/voting/cm_vote_20%(yy)02d%(mm)02d%(dd)02d.xml' % locals() | ||
return requests.get(url) | ||
|
||
vote_xmls = [] | ||
for m in meetings: | ||
vote_xmls.append(crawl_xml(m)) | ||
print('progress: %s/%s %s' % (len(vote_xmls), len(meetings), '#' * len(vote_xmls))) | ||
sys.stdout.flush() | ||
|
||
vote_xmls = filter(lambda r: r.ok, vote_xmls) | ||
vote_xmls = [r.content for r in vote_xmls] | ||
print(len(vote_xmls)) | ||
|
||
# Information fields, useful for reviewing the result | ||
info_fields = ['vote-date', 'vote-time', 'motion-en', 'mover-en', 'mover-type', 'vote-separate-mechanism'] | ||
def xml_to_records(xml): | ||
doc = etree.XML(xml) | ||
records = [] | ||
for topic in doc.xpath('//legcohk-vote/meeting/vote'): | ||
info = [topic.xpath(f)[0].text for f in info_fields] | ||
date = info[0] | ||
topic_id = '%s-%s' % (date, topic.attrib['number']) | ||
for member in topic.xpath('individual-votes/member'): | ||
member_id = member.attrib['name-en'] # Use English name as ID for sipmlicity | ||
vote = member.xpath('vote')[0].text | ||
records.append((topic_id, member_id, vote) + tuple(info)) | ||
return records | ||
|
||
records = [] | ||
for vote_xml in vote_xmls: | ||
records.extend(xml_to_records(vote_xml)) | ||
|
||
# More: | ||
# http://nbviewer.ipython.org/urls/course.ie.cuhk.edu.hk/~engg4030/tutorial/tutorial7/Legco-Preprocessing.ipynb | ||
def clean_record(t): | ||
# According to the numbers, they seem to be the same person | ||
t = list(t) | ||
if t[1] == 'Dr Joseph LEE': | ||
t[1] = 'Prof Joseph LEE' | ||
# Other normalization if any | ||
# ... | ||
return tuple(t) | ||
records = [clean_record(r) for r in records] | ||
|
||
df = pd.DataFrame(records, columns = ['topic_id', 'member_id', 'vote'] + info_fields) | ||
df.to_csv('records-all-with-info.csv', encoding='utf-8') | ||
df.head() |