# --- Working directory -------------------------------------------------------
# Every relative 'data/...' path used by the notebook resolves against this
# directory.
# NOTE(review): absolute, machine-specific path - it has to be edited before
# the notebook can run on any other machine.
os.chdir('C:/users/csala/Onedrive - Lancaster University/datarepo/influence/ro')

# --- Configuration -----------------------------------------------------------
# Host prefix that is prepended to the relative law links further down.
base_url = 'http://www.cdep.ro'

# Seconds to wait for the server before giving up on a single request.
# requests applies no timeout unless one is passed, so a stalled connection
# would otherwise block the crawl indefinitely.
REQUEST_TIMEOUT = 60


def get_url(par1, par2):
    """Return the URL of the evot2015.xml endpoint for the given parameters.

    Both arguments must be strings; they are concatenated into the query
    string verbatim (no URL-encoding is applied).  This notebook calls it
    with par1='1' and a YYYYMMDD date, and with par1='2' and a vote id.
    """
    return 'http://www.parlament.ro/pls/steno/evot2015.xml?par1='+par1+'&par2='+par2


# --- Crawl the per-day vote lists --------------------------------------------
# One timestamp per calendar day of the scraped period (both ends inclusive).
dates = pd.date_range(start='2006-02-06', end='2019-04-25')

votes = []            # one dict per distinct vote id
parsed_votes = set()  # vote ids already collected (de-duplicates rows)
failed_dates = []     # days whose request raised, so they can be retried

for date in dates:
    day = date.strftime('%Y%m%d')
    try:
        response = requests.get(get_url('1', day), timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        # Best effort: remember the day and carry on instead of losing the
        # whole crawl to one network hiccup.
        failed_dates.append(day)
        print('request failed for', day, '-', err)
        continue

    # NOTE(review): no parser is named, so bs4 uses whichever one is
    # installed; the lowercase tag lookups below presumably rely on an
    # HTML-style parser - confirm before pinning one.
    soup = bs4.BeautifulSoup(response.content)
    for row in soup.find_all('row'):
        vote_id = row.find('votid').text
        if vote_id in parsed_votes:
            continue
        desc_tag = row.find('descriere')
        votes.append({
            'votid': vote_id,
            # rows without a <descriere> element get an empty description
            'descriere': desc_tag.text if desc_tag else '',
            'time': row.find('time_vot').text,
            'camera': row.find('camera').text,
        })
        parsed_votes.add(vote_id)
# --- Persist the vote list ---------------------------------------------------
# The context manager closes the file handle deterministically.
with open('data/votes.json', 'w') as fh:
    json.dump(votes, fh)

len(votes)

# --- Crawl the individual ballots of every vote ------------------------------
records = []            # one dict per (vote id, member) ballot row
parsed_records = set()  # vote ids already fetched, so a re-run resumes

# Seconds to wait for the server before giving up on a single request.
RECORDS_TIMEOUT = 60

for vote in votes:
    # Entries of `votes` are dicts keyed by 'votid'.  The previous
    # `vote[0]['votid']` raised KeyError on such dicts; the list-wrapped
    # shape is still accepted for backward compatibility.
    vote_id = vote['votid'] if isinstance(vote, dict) else vote[0]['votid']
    if vote_id in parsed_records:
        continue

    try:
        response = requests.get(get_url('2', vote_id), timeout=RECORDS_TIMEOUT)
    except requests.RequestException as err:
        # Not added to parsed_records, so a re-run of this cell retries it.
        print('request failed for vote', vote_id, '-', err)
        continue

    # NOTE(review): parser is left to bs4's default choice, as in the rest of
    # the notebook - confirm before pinning one.
    soup = bs4.BeautifulSoup(response.content)
    for row in soup.find_all('row'):
        nume = row.find('nume').text
        prenume = row.find('prenume').text
        records.append({
            'votid': vote_id,
            'vot': row.find('vot').text,
            'grup': row.find('grup').text,
            'name': nume + ' ' + prenume,    # "nume prenume" order
            'name2': prenume + ' ' + nume,   # "prenume nume" order
            'camera': row.find('camera').text,
        })
    # Mark the vote as done once its page has been parsed.
    parsed_records.add(vote_id)

# --- Persist the ballots -----------------------------------------------------
with open('data/records.json', 'w') as fh:
    json.dump(records, fh)

# --- Quick size checks (displayed as cell outputs in the notebook) -----------
len(records)

len(parsed_records)
len(votes)

# --- Crawl the per-year law lists --------------------------------------------
votes_desc = []       # one dict per listed law: lname, llink, ldesc
parsed_years = set()  # years already fetched, so a re-run resumes

# Seconds to wait for the server before giving up on a single request.
LIST_TIMEOUT = 60

for year in range(1990, 2020):
    if year in parsed_years:
        continue

    url = 'http://parlament.ro/pls/proiecte/upl_pck2015.lista?anp='+str(year)
    try:
        response = requests.get(url, timeout=LIST_TIMEOUT)
    except requests.RequestException as err:
        # Year stays out of parsed_years, so a re-run of this cell retries it.
        print('request failed for year', year, '-', err)
        continue

    # NOTE(review): parser is left to bs4's default choice - confirm before
    # pinning one.
    soup = bs4.BeautifulSoup(response.content)
    tables = soup.find_all('table')
    # Only the second table of the page is read; pages with fewer than two
    # tables contribute nothing.
    if len(tables) > 1:
        for tr in tables[1].find_all('tr'):
            tds = tr.find_all('td')
            # Rows without <td> cells were already skipped; rows that are
            # too short or carry no link are skipped too instead of raising.
            if len(tds) < 3:
                continue
            anchor = tds[1].find('a')
            if anchor is None or not anchor.has_attr('href'):
                continue
            votes_desc.append({
                'lname': anchor.text,
                'llink': anchor['href'],
                'ldesc': tds[2].text.replace('\n', ''),
            })
    parsed_years.add(year)
    print(year)  # progress indicator

# --- Persist the law list ----------------------------------------------------
with open('data/votes_desc.json', 'w') as fh:
    json.dump(votes_desc, fh)

#votes=json.loads(open('data/votes.json','r').read())


# --- Lookup tables keyed by "<number>/<year>" --------------------------------
def _law_key(lname):
    """Derive the lookup key from a law name.

    The key is the text between the first space and the first '/' of
    `lname`, followed by '/' and the last four characters of `lname`.
    """
    return lname[lname.find(' ')+1:lname.find('/')] + '/' + lname[-4:]


# Later entries overwrite earlier ones when two laws yield the same key.
ldesc = {_law_key(i['lname']): i['ldesc'] for i in votes_desc}
llink = {_law_key(i['lname']): i['llink'] for i in votes_desc}
Senatului\n", "anuldeaciuneeuropeanindomeniulaparariiOM(2016)950(PHD36 | anuldeaciuneeuropeanindomeniulaparariiOM(2016)950(PHD36/17). | anuldeaciuneeuropeanindomeniulaparariiOM(2016)950(PHD36/2018 | 17). | 2019 | PLanul de actiune european in domeniul apararii COM(2016)950 (PH CD 36/2017).\n", "anuldeaciunealUE20172019ombaereadifereneideremunerareinrefemeisibarbaiOM(2017)678(PHD12 | anuldeaciunealUE20172019ombaereadifereneideremunerareinrefemeisibarbaiOM(2017)678(PHD12/18). | anuldeaciunealUE20172019ombaereadifereneideremunerareinrefemeisibarbaiOM(2017)678(PHD12/2018 | 18). | 2019 | PLanul de actiune al UE 2017 - 2019 Combaterea diferentei de remunerare intre femei si barbati COM(2017) 678 (PH CD 12/2018).\n", "anuldeaciunepenrueducaiadigialaOM(2018)22(PHD20 | anuldeaciunepenrueducaiadigialaOM(2018)22(PHD20/18). | anuldeaciunepenrueducaiadigialaOM(2018)22(PHD20/2018 | 18). | 2019 | PLanul de actiune pentru educatia digitala COM(2018) 22 (PH CD 20/2018).\n", "anuldeaciuneprivindFinech:penruunsecorfinanciareuropeanmaicompeiivsimaiinovaorOM(2018)109(PHD33 | anuldeaciuneprivindFinech:penruunsecorfinanciareuropeanmaicompeiivsimaiinovaorOM(2018)109(PHD33/18). | anuldeaciuneprivindFinech:penruunsecorfinanciareuropeanmaicompeiivsimaiinovaorOM(2018)109(PHD33/2018 | 18). | 2019 | PLanul de actiune privind FinTech: pentru un sector financiar european mai competitiv si mai inovator COM(2018)109 (PH CD 33 /2018).\n", "anuldeaciunevizandmobiliaeamiliaraJOIN(2018)5(PHD34 | anuldeaciunevizandmobiliaeamiliaraJOIN(2018)5(PHD34/18). | anuldeaciunevizandmobiliaeamiliaraJOIN(2018)5(PHD34/2018 | 18). 
# --- Match every vote to a law from the law list ------------------------------
# Hand-made corrections for descriptions whose law reference cannot be parsed
# mechanically (exact-match on the text starting at 'PL').
_BASE_FIXES = {
    'PLx 5962010 A': 'PLx 596/2010',
    'PL 301 302/2010 C': 'PLx 301/2010',
    'PLx/2013 240 si PLx 241/2013 pe poz.14 si 15': 'PLx 241/2013',
    'PLx/515/09 C': 'PLx 515/2009',
    'PL 337 338/2009 C': 'PLx 337/2009',
    'PL.643/2011': 'PLx 643/2011',
    'PLx.492/2011': 'PLx 492/2011',
    'PL 566 568 571/2013': 'PLx 566/2013',
}

# Substitutions applied strictly in this order to reduce the reference to
# "<number>/<year>"; the order matters because later rules act on the output
# of earlier ones.
_NAME_SUBSTITUTIONS = (
    ('PL', ''), (' ', ''), ('x', ''), ('-', ''), ('//', '/'), ('//', '/'),
    ('/A', ''), ('/T', ''), ('nr.', ''),
    (' A', ''), (' C', ''), (' R', ''),
    ('A', ''), ('T', ''), ('pct.1', ''), ('R', ''), ('C', ''), ('t', ''),
    ('/207', '/2007'), ('/07', '/2007'), ('/08', '/2008'), ('/09', '/2009'),
    ('/10', '/2010'), ('/11', '/2011'), ('/12', '/2012'), ('/13', '/2013'),
    ('/14', '/2014'), ('/15', '/2015'), ('/16', '/2016'), ('/17', '/2017'),
    ('/18', '/2018'), ('/19', '/2019'),
)


def _attach_law(vote, key):
    """Copy description, key and absolute link of law `key` onto `vote`."""
    vote['lung'] = ldesc[key]
    vote['lege'] = key
    vote['link'] = base_url + llink[key]


nvotes = []
for v in votes:
    # NOTE: `v` is annotated in place, so the entries of `votes` gain the
    # same 'type' / 'lung' / 'lege' / 'link' fields as those of `nvotes`.
    found = False
    d = v['descriere'].replace('Pl', 'PL')
    if 'PL' in d:
        base = d[d.find('PL'):]
        base = _BASE_FIXES.get(base, base)

        lname = base[base.find('PL'):]
        for old, new in _NAME_SUBSTITUTIONS:
            lname = lname.replace(old, new)
        year = lname[-4:]
        law = lname[:lname.find('/')]
        lname = law + '/' + year

        if lname in ldesc:
            # Direct hit on "<number>/<year>".
            found = True
            _attach_law(v, lname)
        else:
            # First fallback: the same number in the preceding year.
            try:
                lname2 = law + '/' + str(int(year) - 1)
            except ValueError:
                # `year` is not numeric; move on to the year scan below.
                pass
            else:
                if lname2 in ldesc:
                    found = True
                    _attach_law(v, lname2)

            if not found:
                # Second fallback: scan all years.  For each one, try that
                # year if it is mentioned in the text, then the year before.
                for year2 in range(1990, 2020):
                    if str(year2) in base or str(year2) in lname:
                        lname2 = law + '/' + str(year2)
                        if lname2 in ldesc:
                            found = True
                            _attach_law(v, lname2)
                    if not found:
                        lname2 = law + '/' + str(year2 - 1)
                        if lname2 in ldesc:
                            found = True
                            _attach_law(v, lname2)
                    if found:
                        break

        if not found:
            # Report references that could not be matched to any law.
            print(law,'|',lname,'|',lname2,'|',year,'|',year2,'|',base)
        else:
            v['type'] = 'PL'
    if not found:
        # No law reference, or none that matched: keep the raw description.
        v['type'] = 'Other'
        v['lung'] = v['descriere']
        v['lege'] = ''
        v['link'] = ''
    nvotes.append(v)

# --- Persist the annotated votes ----------------------------------------------
# The context manager closes the file handle deterministically.
with open('data/nvotes.json', 'w') as fh:
    json.dump(nvotes, fh)