{ "metadata": { "name": "", "signature": "sha256:fa80e244f8b4f4d284011064a66cc799c3e264607b7bd2632f6d4c5c13e3b195" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# LegCoHK\n", "\n", "Author: [Pili Hu](http://hupili.net/)\n", "\n", "GitHub repo: https://github.com/hupili/legcohk\n", "\n", "Related notebooks:\n", "\n", " * PCA for [ENGG4030](https://course.ie.cuhk.edu.hk/~engg4030/): http://bit.ly/1riabfV\n", " * Recommender System for [ENGG4030](https://course.ie.cuhk.edu.hk/~engg4030/): http://bit.ly/QwNvLZ\n", " * Graph Analysis for [ENGG4030](https://course.ie.cuhk.edu.hk/~engg4030/): http://bit.ly/1mxjuqu\n", " \n", "Compared with the above notebooks, \n", "this repo contains more compact notes covering the whole data mining flow --\n", "from data collection to final visualization.\n", "Interpretations will not be provided in the notes directly.\n", "If you are interested, you can dump your thoughts on the [issue tracker](https://github.com/hupili/legcohk/issues).\n", "You can also request other features there." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preparation" ] },
{ "cell_type": "code", "collapsed": false, "input": [ "%pylab inline\n", "import requests\n", "import pylab as pl\n", "from pyquery import PyQuery as pq\n", "import numpy as np\n", "import matplotlib as plt\n", "import scipy\n", "import pandas as pd\n", "from lxml import etree" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Populating the interactive namespace from numpy and matplotlib\n" ] } ], "prompt_number": 1 },
{ "cell_type": "code", "collapsed": false, "input": [
 "seed_pages = [\n",
 "    'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1213.htm',\n",
 "    'http://www.legco.gov.hk/general/english/counmtg/yr12-16/mtg_1314.htm'\n",
 "]\n",
 "\n",
 "def crawl_seed(seed):\n",
 "    '''Collect the meeting anchor names found on one seed page.\n",
 "\n",
 "    Loads `seed` with PyQuery, reads the `name` attribute of every <a>\n",
 "    element (None when the attribute is absent) and keeps only the names\n",
 "    starting with 'cm20'. Returns the filtered collection of names.\n",
 "    '''\n",
 "    anchor_names = pq(seed)('a').map(lambda i, a: a.attrib.get('name', None))\n",
 "    return anchor_names.filter(lambda i, s: s.startswith('cm20'))\n",
 "\n",
 "# Flatten the per-page results into one plain list. The comprehension replaces\n",
 "# reduce(list.__add__, ...), which re-copies the accumulated list at every step\n",
 "# and relies on `reduce` being a builtin (Python 2 only).\n",
 "meetings = [name for seed in seed_pages for name in crawl_seed(seed)]\n",
 "print(meetings)"
 ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "['cm20121010', 'cm20121017', 'cm20121024', 'cm20121031', 'cm20121101', 'cm20121107', 'cm20121114', 'cm20121121', 'cm20121128', 'cm20121205', 'cm20121210', 'cm20121212', 'cm20121219', 'cm20130109', 'cm20130116', 'cm20130117', 'cm20130123', 'cm20130130', 'cm20130206', 'cm20130220', 'cm20130227', 'cm20130320', 'cm20130327', 'cm20130417', 'cm20130424', 'cm20130508', 'cm20130509', 'cm20130515', 'cm20130522', 'cm20130529', 'cm20130605', 'cm20130619', 'cm20130626', 'cm20130703', 'cm20130710', 'cm20130711', 'cm20130717', 'cm20131009', 'cm20131016', 'cm20131017', 'cm20131023', 'cm20131030', 'cm20131106', 'cm20131113', 'cm20131120', 'cm20131127', 'cm20131204', 'cm20131211', 'cm20131218', 'cm20140108', 'cm20140115', 'cm20140116', 'cm20140122', 'cm20140212', 'cm20140219', 'cm20140226', 'cm20140319', 'cm20140326', 'cm20140409', 'cm20140416', 
'cm20140430', 'cm20140507', 'cm20140514', 'cm20140521', 'cm20140522', 'cm20140528', 'cm20140604', 'cm20140611', 'cm20140618', 'cm20140625', 'cm20140702', 'cm20140709']\n" ] } ], "prompt_number": 2 },
{ "cell_type": "code", "collapsed": false, "input": [
 "from IPython.display import clear_output\n",
 "import sys\n",
 "\n",
 "def crawl_xml(meeting, timeout=60):\n",
 "    '''Download the voting-result XML of one council meeting.\n",
 "\n",
 "    meeting -- anchor name such as 'cm20121010' ('cm' followed by YYYYMMDD)\n",
 "    timeout -- seconds handed to requests.get, so that a stalled connection\n",
 "               raises instead of blocking the crawl forever\n",
 "    Returns the requests response object; callers check `.ok` themselves.\n",
 "    '''\n",
 "    # This logic is translated from the official JS code\n",
 "    yy, mm, dd = [int(meeting[i:(i + 2)]) for i in (4, 6, 8)]\n",
 "    # Meetings from October onwards are filed under 'yrYY-(YY+1)',\n",
 "    # earlier months under 'yr(YY-1)-YY'.\n",
 "    first_year = yy if mm >= 10 else yy - 1\n",
 "    yr = 'yr%02d-%02d' % (first_year, first_year + 1)\n",
 "    prefix = 'http://www.legco.gov.hk'\n",
 "    url = '%s/%s/chinese/counmtg/voting/cm_vote_20%02d%02d%02d.xml' % (prefix, yr, yy, mm, dd)\n",
 "    return requests.get(url, timeout=timeout)\n",
 "\n",
 "# Raw HTTP responses, one per meeting. They are kept under their own name so\n",
 "# that the filtering cell below can be re-run without re-crawling.\n",
 "vote_responses = []\n",
 "for m in meetings:\n",
 "    vote_responses.append(crawl_xml(m))\n",
 "    clear_output()\n",
 "    print('progress: %s/%s %s' % (len(vote_responses), len(meetings), '#' * len(vote_responses)))\n",
 "    sys.stdout.flush()"
 ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "progress: 72/72 ########################################################################\n" ] } ], "prompt_number": 3 },
{ "cell_type": "code", "collapsed": false, "input": [
 "# Keep only the successful downloads and take their raw XML bodies.\n",
 "# Reading from `vote_responses` instead of overwriting one shared name keeps\n",
 "# this cell idempotent.\n",
 "vote_xmls = [r.content for r in vote_responses if r.ok]\n",
 "print(len(vote_xmls))"
 ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "46\n" ] } ], "prompt_number": 4 },
{ "cell_type": "code", "collapsed": false, "input": [
 "# Information fields, useful for reviewing the result\n",
 "info_fields = ['vote-date', 'vote-time', 'motion-en', 'mover-en', 'mover-type', 'vote-separate-mechanism']\n",
 "\n",
 "def xml_to_records(xml):\n",
 "    '''Flatten one voting XML document into per-member vote records.\n",
 "\n",
 "    Returns a list of tuples: (topic_id, member_id, vote) followed by one\n",
 "    value per entry of `info_fields`. topic_id is '<vote-date>-<number>'.\n",
 "    Raises IndexError when a vote element lacks one of the expected children.\n",
 "    '''\n",
 "    doc = etree.XML(xml)\n",
 "    records = []\n",
 "    for topic in doc.xpath('//legcohk-vote/meeting/vote'):\n",
 "        info = [topic.xpath(f)[0].text for f in info_fields]\n",
 "        date = info[0]\n",
 "        topic_id = '%s-%s' % (date, topic.attrib['number'])\n",
 "        for member in topic.xpath('individual-votes/member'):\n",
 "            member_id = member.attrib['name-en'] # Use English name as ID for simplicity\n",
 "            vote = member.xpath('vote')[0].text\n",
 "            records.append((topic_id, member_id, vote) + tuple(info))\n",
 "    return records\n",
 "\n",
 "# One flat list over all documents, built in a single pass instead of\n",
 "# repeatedly concatenating lists.\n",
 "records = [rec for xml in vote_xmls for rec in xml_to_records(xml)]"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 5 },
{ "cell_type": "code", "collapsed": false, "input": [
 "# More:\n",
 "# http://nbviewer.ipython.org/urls/course.ie.cuhk.edu.hk/~engg4030/tutorial/tutorial7/Legco-Preprocessing.ipynb\n",
 "def clean_record(t):\n",
 "    '''Return a copy of record tuple `t` with the member name normalized.\n",
 "\n",
 "    'Dr Joseph LEE' is rewritten to 'Prof Joseph LEE'; every other record\n",
 "    comes back unchanged.\n",
 "    '''\n",
 "    # According to the numbers, they seem to be the same person\n",
 "    t = list(t)\n",
 "    if t[1] == 'Dr Joseph LEE':\n",
 "        t[1] = 'Prof Joseph LEE'\n",
 "    # Other normalization if any\n",
 "    # ...\n",
 "    return tuple(t)\n",
 "records = [clean_record(r) for r in records]"
 ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 },
{ "cell_type": "code", "collapsed": false, "input": [
 "df = pd.DataFrame(records, columns = ['topic_id', 'member_id', 'vote'] + info_fields)\n",
 "df.to_csv('records-all-with-info.csv', encoding='utf-8')\n",
 "df[:5]"
 ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", " | topic_id | \n", "member_id | \n", "vote | \n", "vote-date | \n", "vote-time | \n", "motion-en | \n", "mover-en | \n", "mover-type | \n", "vote-separate-mechanism | \n", "
---|---|---|---|---|---|---|---|---|---|
0 | \n", "17/10/2012-1 | \n", "TSANG Yok-sing | \n", "Present | \n", "17/10/2012 | \n", "19:37:53 | \n", "AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | \n", "Dr Kenneth CHAN | \n", "Member | \n", "Yes | \n", "
1 | \n", "17/10/2012-1 | \n", "Albert HO | \n", "Yes | \n", "17/10/2012 | \n", "19:37:53 | \n", "AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | \n", "Dr Kenneth CHAN | \n", "Member | \n", "Yes | \n", "
2 | \n", "17/10/2012-1 | \n", "LEE Cheuk-yan | \n", "Yes | \n", "17/10/2012 | \n", "19:37:53 | \n", "AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | \n", "Dr Kenneth CHAN | \n", "Member | \n", "Yes | \n", "
3 | \n", "17/10/2012-1 | \n", "James TO | \n", "Yes | \n", "17/10/2012 | \n", "19:37:53 | \n", "AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | \n", "Dr Kenneth CHAN | \n", "Member | \n", "Yes | \n", "
4 | \n", "17/10/2012-1 | \n", "CHAN Kam-lam | \n", "No | \n", "17/10/2012 | \n", "19:37:53 | \n", "AMENDMENT BY DR HON KENNETH CHAN TO HON IP KIN... | \n", "Dr Kenneth CHAN | \n", "Member | \n", "Yes | \n", "
5 rows \u00d7 9 columns
\n", "\n", " | topic_id | \n", "member_id | \n", "vote | \n", "
---|---|---|---|
0 | \n", "17/10/2012-1 | \n", "TSANG Yok-sing | \n", "Present | \n", "
1 | \n", "17/10/2012-1 | \n", "Albert HO | \n", "Yes | \n", "
2 | \n", "17/10/2012-1 | \n", "LEE Cheuk-yan | \n", "Yes | \n", "
3 | \n", "17/10/2012-1 | \n", "James TO | \n", "Yes | \n", "
4 | \n", "17/10/2012-1 | \n", "CHAN Kam-lam | \n", "No | \n", "
5 rows \u00d7 3 columns
\n", "\n", " | Present | \n", "Yes | \n", "No | \n", "Absent | \n", "Abstain | \n", "
---|---|---|---|---|---|
0 | \n", "(TSANG Yok-sing, 1054) | \n", "(LEUNG Kwok-hung, 685) | \n", "(YIU Si-wing, 856) | \n", "(Dr LEUNG Ka-lau, 803) | \n", "(Gary FAN, 238) | \n", "
1 | \n", "(Albert HO, 295) | \n", "(CHAN Chi-chuen, 672) | \n", "(Andrew LEUNG, 854) | \n", "(LEUNG Yiu-chung, 799) | \n", "(MA Fung-kwok, 113) | \n", "
2 | \n", "(Cyd HO, 209) | \n", "(Albert CHAN, 405) | \n", "(Ir Dr LO Wai-kwok, 853) | \n", "(WONG Yuk-man, 788) | \n", "(Prof Joseph LEE, 108) | \n", "
3 | \n", "(WU Chi-wai, 150) | \n", "(Charles Peter MOK, 264) | \n", "(Christopher CHEUNG, 842) | \n", "(Frederick FUNG, 776) | \n", "(IP Kwok-him, 105) | \n", "
4 | \n", "(Charles Peter MOK, 148) | \n", "(Gary FAN, 245) | \n", "(TAM Yiu-chung, 800) | \n", "(James TO, 763) | \n", "(CHAN Kam-lam, 102) | \n", "
5 rows \u00d7 5 columns
\n", "\n", " | Present | \n", "Yes | \n", "No | \n", "Absent | \n", "Abstain | \n", "
---|---|---|---|---|---|
0 | \n", "(YIU Si-wing, 1) | \n", "(Dr LAU Wong-fat, 23) | \n", "(Prof Joseph LEE, 52) | \n", "(TSANG Yok-sing, 1) | \n", "(Frederick FUNG, 11) | \n", "
1 | \n", "(LEUNG Kwok-hung, 1) | \n", "(Abraham SHEK, 60) | \n", "(Claudia MO, 55) | \n", "(Ir Dr LO Wai-kwok, 19) | \n", "(Vincent FANG, 13) | \n", "
2 | \n", "(Andrew LEUNG, 1) | \n", "(Dr LAM Tai-fai, 65) | \n", "(Albert HO, 56) | \n", "(YIU Si-wing, 20) | \n", "(Dennis KWOK, 13) | \n", "
3 | \n", "(CHAN Kin-por, 1) | \n", "(Vincent FANG, 71) | \n", "(Dennis KWOK, 57) | \n", "(POON Siu-ping, 22) | \n", "(Claudia MO, 13) | \n", "
4 | \n", "(LEE Cheuk-yan, 2) | \n", "(Jeffrey LAM, 75) | \n", "(Ronny TONG, 57) | \n", "(TAM Yiu-chung, 30) | \n", "(Dr Kenneth CHAN, 14) | \n", "
5 rows \u00d7 5 columns
\n", "\n", " | 01/02/2013-1 | \n", "01/02/2013-2 | \n", "01/02/2013-3 | \n", "01/02/2013-4 | \n", "01/02/2013-5 | \n", "01/02/2013-6 | \n", "01/02/2013-7 | \n", "01/02/2013-8 | \n", "03/07/2013-1 | \n", "03/07/2013-10 | \n", "03/07/2013-2 | \n", "03/07/2013-3 | \n", "03/07/2013-4 | \n", "03/07/2013-5 | \n", "03/07/2013-6 | \n", "03/07/2013-7 | \n", "03/07/2013-8 | \n", "03/07/2013-9 | \n", "04/07/2013-11 | \n", "04/12/2013-1 | \n", "\n", " |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
TSANG Yok-sing | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "Present | \n", "... | \n", "
Albert HO | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "No | \n", "Yes | \n", "Yes | \n", "No | \n", "Yes | \n", "Yes | \n", "No | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "No | \n", "... | \n", "
LEE Cheuk-yan | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "No | \n", "Yes | \n", "Yes | \n", "No | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Abstain | \n", "Yes | \n", "Yes | \n", "No | \n", "... | \n", "
James TO | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "No | \n", "Yes | \n", "Yes | \n", "No | \n", "Yes | \n", "Yes | \n", "No | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "Yes | \n", "No | \n", "... | \n", "
CHAN Kam-lam | \n", "No | \n", "No | \n", "No | \n", "No | \n", "Abstain | \n", "No | \n", "No | \n", "Yes | \n", "No | \n", "Abstain | \n", "Yes | \n", "Abstain | \n", "Abstain | \n", "Abstain | \n", "Abstain | \n", "Abstain | \n", "Abstain | \n", "Abstain | \n", "Abstain | \n", "Yes | \n", "... | \n", "
5 rows \u00d7 1055 columns
\n", "\n", " | 01/02/2013-1 | \n", "01/02/2013-2 | \n", "01/02/2013-3 | \n", "01/02/2013-4 | \n", "01/02/2013-5 | \n", "01/02/2013-6 | \n", "01/02/2013-7 | \n", "01/02/2013-8 | \n", "03/07/2013-1 | \n", "03/07/2013-10 | \n", "03/07/2013-2 | \n", "03/07/2013-3 | \n", "03/07/2013-4 | \n", "03/07/2013-5 | \n", "03/07/2013-6 | \n", "03/07/2013-7 | \n", "03/07/2013-8 | \n", "03/07/2013-9 | \n", "04/07/2013-11 | \n", "04/12/2013-1 | \n", "\n", " |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
TSANG Yok-sing | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "
Albert HO | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "-1 | \n", "1 | \n", "1 | \n", "-1 | \n", "1 | \n", "1 | \n", "-1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "-1 | \n", "... | \n", "
LEE Cheuk-yan | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "-1 | \n", "1 | \n", "1 | \n", "-1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "0 | \n", "1 | \n", "1 | \n", "-1 | \n", "... | \n", "
James TO | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "-1 | \n", "1 | \n", "1 | \n", "-1 | \n", "1 | \n", "1 | \n", "-1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "-1 | \n", "... | \n", "
CHAN Kam-lam | \n", "-1 | \n", "-1 | \n", "-1 | \n", "-1 | \n", "0 | \n", "-1 | \n", "-1 | \n", "1 | \n", "-1 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "... | \n", "
5 rows \u00d7 1055 columns
\n", "