{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# IPython notebook for ultimateposeur.com article"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This contains the code for an ultimateposeur.com article: <a href='http://www.ultimateposeur.com/2016/06/is-carlos-carballo-most-ruthless.html'>Is Carlos Carballo The Most Ruthless Referee Of Euro 2016?</a>.\n",
    "\n",
    "(The article has also been republished by an Indian national newspaper <i>The Hindu</i> in their web-exclusive <a href='http://www.thehindu.com/thread/sports/article8709613.ece'>thREAD section</a>.)\n",
    "\n",
    "For the github repository click <a href='https://github.com/shijithpk/referees_euro_2016'>here</a>.\n",
    "\n",
    "Each of the code blocks begins with a commented out section explaining what it does.\n",
    "\n",
    "To run this notebook, <a href='http://jupyter.readthedocs.io/en/latest/install.html'>install</a> Jupyter/IPython notebook first.\n",
    "\n",
    "Then <a href='https://github.com/shijithpk/referees_euro_2016/blob/master/IPython_notebook_ultimateposeur_refs_article.ipynb'>download</a> this notebook, <a href='jupyter.readthedocs.io/en/latest/running.html'>open</a> it with Jupyter/IPython and press SHIFT+ENTER in each code block to execute it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#Coming from the journalism world, i've always been surprised by how programming/coding is such a \n",
    "#collaborative effort and how people have no issues building off others' ideas and exchanging info, \n",
    "#in fact they welcome it.\n",
    "#but if i want to sell this as journalism, i'll have to do some 'full disclosure' preliminaries.\n",
    "#so bits and pieces of the code are from stackoverflow, github, udacity, \n",
    "#pythonlearn.com, pycon and scipy videos on youtube, sundry google results etc.\n",
    "#these are all too numerous to recount, so while the individual parts of this aren't original, \n",
    "#the assembly and piecing together of it is original and my own. If anyone sees \n",
    "#a snippet of their code and they want credit for it in the code blocks below\n",
    "#let me know (ultimateposeur@gmail.com) and i'll put a line in.\n",
    "\n",
    "\n",
    "# the code for how i scraped everything won't make sense \n",
    "#unless you're familiar with the soccerway page structure,\n",
    "#but i'm putting this out there in the interest of transparency\n",
    "#and reproducibility. This way you can see that I kind of know\n",
    "#what I'm doing! So i've seen braindumps like this \n",
    "#on the net from others, and while i never really go through\n",
    "#the whole thing, there's usually a piece of code in there that\n",
    "#i can borrow, so hopefully there'll be some google result\n",
    "#that will lead someone to this page and they find \n",
    "#something in it useful!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#so this first part is about getting data from soccerway.com\n",
    "\n",
    "\n",
    "#First I noted down the urls of the referee pages I was interested in \n",
    "#and saved it in 'referee_pages.txt' (check github repo). This is where I \n",
    "#start my crawling from. The pages i'm downloading here have a list of \n",
    "#all the matches of a referee, in some cases going back 10 years\n",
    "\n",
    "#just realised i'm probably not using this notebook right\n",
    "#some notebooks are really gussied up with all sorts of \n",
    "#fancy bold headings etc., I think I'd rather just go \n",
    "#stream of consciousness right now.\n",
    "\n",
    "import re\n",
    "\n",
    "reflist = open('referee_pages.txt')\n",
    "refids = []\n",
    "for i in reflist:\n",
    "    refid = (re.findall(r'\\d+', i))[0]\n",
    "    refids.append(refid)\n",
    "print refids \n",
    "\n",
    "from random import randint\n",
    "import time\n",
    "import requests\n",
    "\n",
    "a = 'http://us.soccerway.com/a/block_player_matches?block_id=page_player_1_block_player_matches_4&callback_params=%7B%22page%22%3A%22%22%2C%22bookmaker_urls%22%3A%5B%5D%2C%22block_service_id%22%3A%22player_matches_block_playermatches%22%2C%22people_id%22%3A%22'\n",
    "c = '%22%2C%22type%22%3A%22%22%2C%22formats%22%3A%22%22%7D&action=changePage&params=%7B%22page%22%3A'\n",
    "e = '%7D'\n",
    "\n",
    "headersx = {\n",
    "    'Host':'us.soccerway.com',\n",
    "    'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',\n",
    "    'Accept':'text/javascript, text/html, application/xml, text/xml, */*',\n",
    "    'Accept-Language':'en-US,en;q=0.5',\n",
    "    'Accept-Encoding':'gzip, deflate',\n",
    "    'X-Requested-With':'XMLHttpRequest',\n",
    "    'X-Prototype-Version':'1.7',\n",
    "    'Referer':'http://us.soccerway.com/',\n",
    "    'Cookie':'__utma=254272778.1038598711.1464594149.1464631260.1464644537.7; __utmc=254272778; __utmz=254272778.1464603708.3.3.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); s_cc=true; s_fid=0B0C5F8BE6B3EC61-1954A0A8D6FC41BE; s_nr=1464644536442-Repeat; s_ppv=-%2C45%2C45%2C671; s_sq=%5B%5BB%5D%5D; ev4=jonas%20erikkson; __utma=58539694.601279174.1464635680.1464635680.1464635680.1; __utmc=58539694; __utmz=58539694.1464635680.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); prevPage=Players%20%3A%20Pavel%20Kralovec%20%3A%20matches; __utmb=254272778.1.10.1464644537; __utmt=1',\n",
    "    'Connection':'keep-alive'\n",
    "}\n",
    "#if i had to do this differently, I'd put everything into a sub-directory, all the files were just downloaded into my working directory!\n",
    "\n",
    "for b in refids:\n",
    "    d = 0\n",
    "    while d > -21:\n",
    "        urlx = a + b + c + str(d) + e\n",
    "        responsex = requests.get(urlx, headers = headersx)\n",
    "        if d == 0:\n",
    "            dz = '-00'\n",
    "        else:\n",
    "            dz = str(d).zfill(3)\n",
    "        fname = b + dz + '.html'\n",
    "        with open(fname, \"w\") as filex:\n",
    "            filex.write(responsex.content)\n",
    "        d -= 1\n",
    "        print fname\n",
    "        delay = randint(10,20)\n",
    "        time.sleep(delay)\n",
    "    else:\n",
    "        continue\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#executing the command above will download a whole lot of files\n",
    "#that are lists of matches related to each referee\n",
    "#they'll get downloaded into your directory\n",
    "#as I mentioned in the code, it probably would have been better\n",
    "#to put it all into a sub-directory\n",
    "#this code block is just about me creating subdirectories for each \n",
    "#referee, so that I can download each match file and put it into\n",
    "#the directory for the respective referee\n",
    "\n",
    "#refdeux below is the list of referee ids in soccerway.com\n",
    "\n",
    "import os, sys\n",
    "\n",
    "refdeux = ['44515', '44615', '92755', '44516', '45012', '45089', '44971', '85292', '44678', '44486', '44867', '87294', '45554', '44902', '44754', '45059', '45511', '45077']\n",
    "\n",
    "for i in refdeux:\n",
    "    pathx = '/media/C-drive/Users/shijithpk/Downloads/ipython/2016_05_30_referees/' + i + '-full'\n",
    "    os.mkdir(pathx)\n",
    "\n",
    "#so the line above is the path to my working directory. You'll have to replace it with your own path.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#this code block is about me scraping the lists of match \n",
    "#documents from soccerway.com\n",
    "#'edict' the dictionary below has the fields that I noticed\n",
    "# each document has\n",
    "#but the only thing that matters in those documents are links\n",
    "#to the actual matches for each referee\n",
    "#what we've gotten so far from soccerway is just 'lists' of matches\n",
    "#and just to experiment with creating json files\n",
    "#all this data was dumped into json files in each\n",
    "#referee directory\n",
    "\n",
    "import json\n",
    "from lxml import html\n",
    "import sys\n",
    "import os\n",
    "\n",
    "refdeux = ['44515', '44615', '92755', '44516', '45012', '45089', '44971', '85292', '44678', '44486', '44867', '87294', '45554', '44902', '44754', '45059', '45511', '45077']\n",
    "\n",
    "for i in refdeux:\n",
    "    a, b, c, d, e, f, g, h, l = ([] for m in range(9))\n",
    "    edict = dict.fromkeys(['Referee', 'Date', 'Competition', 'Competition_link', 'Home_team', 'Score', 'Match_link', 'Away_team', 'Events'])\n",
    "\n",
    "\n",
    "    for j in range(0,21):\n",
    "        k = str(j).zfill(2)\n",
    "        fnamex = i + '-' + k + '.html'\n",
    "        try:\n",
    "            fhandle = open(fnamex)\n",
    "            sourcex = fhandle.read()\n",
    "            jsource = json.loads(str(sourcex))\n",
    "            contentx = jsource['commands'][0]['parameters']['content']\n",
    "            treex = html.document_fromstring(contentx)\n",
    "            trs = treex.cssselect('table tbody tr')\n",
    "\n",
    "            for tr in trs:\n",
    "                try:\n",
    "                    a.append(tr.cssselect('td:nth-child(2)')[0].text_content())\n",
    "                    b.append(tr.cssselect('td:nth-child(3)')[0].text_content())\n",
    "                    xy = 'http://us.soccerway.com' + tr.cssselect('td:nth-child(3) a')[0].attrib['href']\n",
    "                    c.append(xy)\n",
    "                    d.append(tr.cssselect('td:nth-child(4)')[0].text_content())\n",
    "                    e.append(tr.cssselect('td:nth-child(5)')[0].text_content())\n",
    "                    yz =  'http://us.soccerway.com' + tr.cssselect('td:nth-child(5) a')[0].attrib['href']\n",
    "                    f.append(yz)\n",
    "                    g.append(tr.cssselect('td:nth-child(6)')[0].text_content())\n",
    "                    h.append(tr.cssselect('td:nth-child(7)')[0].text_content())\n",
    "                    l.append(i)\n",
    "                except:\n",
    "                    print \"A row wasnt scraped here.\"\n",
    "        except:\n",
    "            print fnamex + ' didnt work'\n",
    "        print fnamex + ' done'\n",
    "\n",
    "    \n",
    "    edict['Referee'] = l\n",
    "    edict['Date'] = a\n",
    "    edict['Competition'] = b\n",
    "    edict['Competition_link'] = c\n",
    "    edict['Home_team'] = d\n",
    "    edict['Score'] = e\n",
    "    edict['Match_link'] = f\n",
    "    edict['Away_team'] = g\n",
    "    edict['Events'] = h\n",
    "    filenamey = i + '-full/' + i + '-collated.json'\n",
    "    fnamez = os.path.join(os.path.dirname(os.path.realpath('__file__')), filenamey)\n",
    "    json.dump(edict, open(fnamez , 'w')) #used json here instead of csv's just so that I could practise creating json files\n",
    "    print i + ' done'\n",
    "\n",
    "print 'done, done'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#get the values from each key\n",
    "#download each file, set headers, set timeout between requests\n",
    "#make sure files get downloaded into the right folders\n",
    "\n",
    "#so this is where the bulk of the downloading actuall took \n",
    "#place. Because of the 10-20 second delay i placed between requests, it took\n",
    "#around 20 hours or so to download all the files\n",
    "#so what's happening now is files for each match officiated \n",
    "#by each referee (or at least the matches that are there in \n",
    "#soccerway's database), these files are being downloaded\n",
    "\n",
    "#i decided it would be better to just download all the files\n",
    "#and do my scraping for the information i need on yellow cards\n",
    "#times of booking, etc. , do that scraping on my local archive\n",
    "\n",
    "#so that way even if I get my scraping code wrong, it wont\n",
    "#be an issue because i just start over\n",
    "#if i was doing that trial and error by scraping the information\n",
    "#off the internet directly, i would have just been \n",
    "#requesting files over and over again. This local archive would \n",
    "#avoid that\n",
    "\n",
    "#also note that the output can get pretty verbose for this\n",
    "#so if you want you can comment out the print statement\n",
    "#and make it so so that there's only something printed \n",
    "#if somehting goes wrong\n",
    "\n",
    "#all data downloaded on may 31 & june 1\n",
    "\n",
    "\n",
    "from random import randint\n",
    "import time\n",
    "import requests\n",
    "import json\n",
    "import sys\n",
    "import os\n",
    "\n",
    "#furl = '/44486-full/44486-collated.json'\n",
    "\n",
    "refdeux = ['44515', '44615', '92755', '44516', '45012', '45089', '44971', '85292', '44678', '44486', '44867', '87294', '45554', '44902', '44754', '45059', '45511', '45077']\n",
    "\n",
    "headerz = {\n",
    "    'Host':'us.soccerway.com',\n",
    "    'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0',\n",
    "    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',\n",
    "    'Accept-Language':'en-US,en;q=0.5',\n",
    "    'Accept-Encoding':'gzip, deflate',\n",
    "    'Referer':'http://us.soccerway.com/',\n",
    "    'Cookie':'__utma=254272778.1038598711.1464594149.1464682768.1464701479.10; __utmc=254272778; __utmz=254272778.1464603708.3.3.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); s_cc=true; s_fid=0B0C5F8BE6B3EC61-1954A0A8D6FC41BE; s_nr=1464701530831-Repeat; s_ppv=-%2C45%2C45%2C671; s_sq=%5B%5BB%5D%5D; ev4=jonas%20erikkson; __utma=58539694.601279174.1464635680.1464635680.1464635680.1; __utmc=58539694; __utmz=58539694.1464635680.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); prevPage=Players%20%3A%20William%20Collum%20%3A%20matches; __utmb=254272778.3.10.1464701480; __utmt=1',\n",
    "    'Connection':'keep-alive'\n",
    "}\n",
    "\n",
    "for i in refdeux:\n",
    "    filenamey = i + '-full/' + i + '-collated.json'\n",
    "    fnamez = os.path.join(os.path.dirname(os.path.realpath('__file__')), filenamey)\n",
    "    with open(fnamez) as fx:\n",
    "        a = json.load(fx)\n",
    "    urllist = a['Match_link']\n",
    "    print i + '-collated.json ' + ' len-urllist ' + str(len(urllist))\n",
    "    d = 0\n",
    "    for urlx in urllist:\n",
    "        print 'The url in question: ' + urlx\n",
    "        try:\n",
    "            responsex = requests.get(urlx, headers = headerz)\n",
    "            d += 1\n",
    "            zrs = str(d).zfill(3)\n",
    "            fnad =  i + '-full/' + i + '-' + zrs + '.html'\n",
    "            fnadr =  os.path.join(os.path.dirname(os.path.realpath('__file__')), fnad)\n",
    "            with open(fnadr, \"w\") as filex:\n",
    "                filex.write(responsex.content)\n",
    "            print fnad + ' successfully written'\n",
    "            delay = randint(10,20)\n",
    "            time.sleep(delay)\n",
    "        except:\n",
    "            print \"Something went wrong here.\"\n",
    "    print i + ' done'\n",
    "    \n",
    "print 'done and done'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#grr, have kind of forgotten what i've done a few things for!\n",
    "#this wasnt' a straightforward a to b to c thing\n",
    "# it was a to b, then back to a, then c, then back to a, \n",
    "#you get the idea, i've done so many things, that now i realize\n",
    "# i should comment out code whenever i do anything\n",
    "#so that if i have to remind myself why i did something\n",
    "#at least there's some note or something to help me out\n",
    "#i'm writing all these comments a week after\n",
    "#so anyway I figured out what this block was for \n",
    "# it was to get the no. of matches for each refeee that soccerwy\n",
    "#has recorded data for, just wanted to have a sense of what i was getting into\n",
    "\n",
    "\n",
    "import json\n",
    "import sys\n",
    "import os\n",
    "\n",
    "\n",
    "refdeux = ['44515', '44615', '92755', '44516', '45012', '45089', '44971', '85292', '44678', '44486', '44867', '87294', '45554', '44902', '44754', '45059', '45511', '45077']\n",
    "fdict = dict.fromkeys(['refid', 'matches'])\n",
    "a = []\n",
    "b = []\n",
    "\n",
    "for i in refdeux:\n",
    "    a.append(i)\n",
    "    filenamey = i + '-full/' + i + '-collated.json'\n",
    "    fnamez = os.path.join(os.path.dirname(os.path.realpath('__file__')), filenamey)\n",
    "    with open(fnamez) as fx:\n",
    "        d = json.load(fx)\n",
    "    urllist = d['Match_link']\n",
    "    b.append(str(len(urllist)))\n",
    "fdict['refid'] = a\n",
    "fdict['matches'] = b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#so the code below is about creating the main dataset\n",
    "#with info on each card, what type it was - yellow, second yellow\n",
    "#or red, who was booked, what time they were booked, the \n",
    "# team they were playing for at the time etc.\n",
    "\n",
    "#again the output for this is verbose, so if you're \n",
    "#reproducing this, you might want to cut down on the \n",
    "#print statements\n",
    "\n",
    "import re\n",
    "from lxml import html\n",
    "import sys\n",
    "import os\n",
    "import csv\n",
    "\n",
    "reflist = open('referee_pages.txt')\n",
    "refid_list = []\n",
    "refname_list = []\n",
    "#am sure there's a shorter way to do this, couldnt figure it out, what i'm doing below is \n",
    "    #taking an element out of one list and appending it to another list!\n",
    "for i in reflist: \n",
    "    refidx = (re.findall(r'\\d+', i))[0]\n",
    "    refid_list.append(refidx)\n",
    "    refnamex = (re.findall(r'http://us.soccerway.com/referees/(.*)/.*/matches/', i))[0] \n",
    "    #extract name without diacritics, there was a turkish referee name that python completely mangled, this is safer\n",
    "    refname_list.append(refnamex)\n",
    "\n",
    "    \n",
    "headings = ['file_name', 'ref_code', 'ref_name_in_url', 'ref_nationality','match_code', 'match_duration',\n",
    "            'home_team_in_url', 'away_team_in_url', 'date', 'competition','winning_team', \n",
    "            'card_type','minute_card_given', 'player_name_in_url', 'player_nationality_in_url']\n",
    "\n",
    "#write/stream to a csv file, don't hold this in memory\n",
    "\n",
    "with open(\"dataset_main.csv\", \"w\") as filex:   #after this change mode to a for append, otherwise file gets overwritten\n",
    "    wr = csv.writer(filex, delimiter = ',' , quotechar = '\"' )\n",
    "    wr.writerow(headings)\n",
    "\n",
    "\n",
    "for i in refid_list:\n",
    "    d = range(1,450)\n",
    "    for x in d:\n",
    "        zrs = str(x).zfill(3)\n",
    "        fnad =  i + '-full/' + i + '-' + zrs + '.html'\n",
    "        fnadr =  os.path.join(os.path.dirname(os.path.realpath('__file__')), fnad)\n",
    "        try:\n",
    "            with open(fnadr) as filey:\n",
    "                row_list = []\n",
    "                sourcex = filey.read()\n",
    "                treex = html.document_fromstring(sourcex)\n",
    "                #check if referee isn't assistant referee or fourth official\n",
    "                if i not in treex.cssselect('dl.details > dd:nth-child(2) > a:nth-child(1)')[0].attrib['href']:\n",
    "                    continue\n",
    "                #append to the row list in the same order\n",
    "                row_list.append(fnad)\n",
    "                ref_code = i\n",
    "                row_list.append(ref_code)\n",
    "                ref_pos = refid_list.index(i)\n",
    "                ref_name_in_url = refname_list[ref_pos]\n",
    "                row_list.append(ref_name_in_url)\n",
    "                ref_nationality = treex.cssselect('dl.details > dd:nth-child(2) > a:nth-child(1)')[0].attrib['class']\n",
    "                ref_natx = (re.findall(r'flag_16 left_16 (.+?)_16_left referee', ref_nationality))[0]\n",
    "                row_list.append(ref_natx)\n",
    "                match_code = treex.cssselect('#submenu > ul:nth-child(1) > li:nth-child(1) > a:nth-child(1)')[0].attrib['href']\n",
    "                match_codex = (re.findall(r'/matches/.*/.*/.*/.*/.*/.*/.*/(.*)/', match_code))[0]\n",
    "                row_list.append(match_codex)\n",
    "                try:\n",
    "                    if (treex.cssselect('.scoretime > span:nth-child(1)')[0].text_content() or treex.cssselect('.scoretime > span:nth-child(2)')[0].text_content()) == ('E' or 'P'):\n",
    "                        match_duration = 120\n",
    "                        row_list.append(match_duration)\n",
    "                except:\n",
    "                    match_duration = 90\n",
    "                    row_list.append(match_duration)\n",
    "                home_team_in_url = treex.cssselect('#page_match_1_block_match_info_4 > div:nth-child(1) > div:nth-child(1) > h3:nth-child(1) > a:nth-child(1)')[0].attrib['href']\n",
    "                home_teamx = (re.findall(r'/teams/(.*/.*)/.*/', home_team_in_url))[0]\n",
    "                row_list.append(home_teamx)\n",
    "                away_team_in_url = treex.cssselect('#page_match_1_block_match_info_4 > div:nth-child(1) > div:nth-child(3) > h3:nth-child(1) > a:nth-child(1)')[0].attrib['href']\n",
    "                away_teamx = (re.findall(r'/teams/(.*/.*)/.*/', away_team_in_url))[0]\n",
    "                row_list.append(away_teamx)\n",
    "                date_match = treex.cssselect('div.details:nth-child(1) > dl:nth-child(1) > dd:nth-child(4) > a:nth-child(1)')[0].attrib['href']\n",
    "                date_matchx = (re.findall(r'/.*/(.*/.*/.*)/', date_match))[0]\n",
    "                row_list.append(date_matchx)\n",
    "                competition = treex.cssselect('div.details:nth-child(1) > dl:nth-child(1) > dd:nth-child(2) > a:nth-child(1)')[0].attrib['href']\n",
    "                compx = (re.findall(r'/(.*/.*/.*)/.*/.*/.*/', competition))[0]\n",
    "                row_list.append(compx)\n",
    "                scorex = treex.cssselect('.bidi')[0].text_content()\n",
    "                score_list = scorex.split()\n",
    "                if score_list[0] > score_list[2]:\n",
    "                    winning_team = home_teamx\n",
    "                    row_list.append(winning_team)\n",
    "                elif score_list[0] < score_list[2]:\n",
    "                    winning_team = away_teamx\n",
    "                    row_list.append(winning_team)\n",
    "                else:\n",
    "                    winning_team = 'draw'\n",
    "                    row_list.append(winning_team)\n",
    "                cdx = treex.xpath(\"//img[contains(@src,'YC.png')]|//img[contains(@src,'RC.png')]|//img[contains(@src,'Y2C.png')]\")\n",
    "                if cdx == []: #this is just a check for matches without any yellow or red cards\n",
    "                    row_final = list(row_list)\n",
    "                    row_expand = list()\n",
    "                    card_type = 'zero'\n",
    "                    row_expand.append(card_type)\n",
    "                    minute_card_given = ''\n",
    "                    row_expand.append(minute_card_given)\n",
    "                    player_name_in_url = ''\n",
    "                    row_expand.append(player_name_in_url)\n",
    "                    player_nationality_in_url = ''\n",
    "                    row_expand.append(player_nationality_in_url)\n",
    "                    row_final.extend(row_expand)\n",
    "                    with open(\"dataset_main.csv\", \"a\") as filez:\n",
    "                        wrz = csv.writer(filez, delimiter = ',' , quotechar = '\"' )\n",
    "                        wrz.writerow(row_final)\n",
    "                else:\n",
    "                    for cd in cdx:\n",
    "                        row_final = list(row_list)\n",
    "                        row_expand = []\n",
    "                        #make sure youve taken care of all potential slip-ups - mentioned in excel sheet\n",
    "                        card_type = cd.xpath('@src')[0]\n",
    "                        card_typex = (re.findall(r'http://.*/.*/.*/.*/.*/(.*).png', card_type))[0]\n",
    "                        row_expand.append(card_typex)\n",
    "                        minute_card_given = cd.xpath('../text()')[0]\n",
    "                        row_expand.append(minute_card_given)\n",
    "                        #the values in this are weird, some leading white spaces, minute mark at end, 90 + 4 etc., will need further work\n",
    "                        player_name_in_url = cd.xpath('preceding::td[1]//a[1]/@href')[0]\n",
    "                        player_name_in_urlx = (re.findall(r'/.*/(.*)/.*/', player_name_in_url))[0]\n",
    "                        row_expand.append(player_name_in_urlx)\n",
    "                        player_nationality_in_url = cd.xpath('preceding::td[1]//a[1]/@class')[0]\n",
    "                        player_nationality_in_urlx = (re.findall(r'flag_16 left_16 (.+?)_16_left', player_nationality_in_url))[0]\n",
    "                        row_expand.append(player_nationality_in_urlx)\n",
    "                        row_final.extend(row_expand)\n",
    "                        with open(\"dataset_main.csv\", \"a\") as filez:\n",
    "                            wrz = csv.writer(filez, delimiter = ',' , quotechar = '\"' )\n",
    "                            wrz.writerow(row_final)\n",
    "        except:\n",
    "            print 'No file named ', fnadr, ' forget about it.'\n",
    "\n",
    "filez.close() #dont forget to close the csv file once you've finished appending everything\n",
    "\n",
    "print 'done and done'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#now that i have my main dataset, will be doing some cleaning on it\n",
    "#\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "dfg = pd.read_csv('dataset_main.csv')\n",
    "\n",
    "#so i load my csv into a dataframe\n",
    "\n",
    "dfg['card_type'].unique()\n",
    "dfg_filtered = dfg[dfg['card_type'].isin(['YC', 'zero', 'RC', 'Y2C'])]\n",
    "len(dfg_filtered)\n",
    "len(dfg)\n",
    "\n",
    "# did the commands above to check if everything is kosher\n",
    "#turns out some of the pages werent' formatted right\n",
    "#so the wrong info came into the card type column\n",
    "#\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "np.sort(dfg_filtered['minute_card_given'].unique())\n",
    "\n",
    "#this command above will show you that again some rows\n",
    "#didn't record the time of booking properly, so will be \n",
    "#filtering those rows out in the line below\n",
    "\n",
    "\n",
    "dfg_for_A = dfg_filtered[dfg_filtered['minute_card_given'] != \" '\"]\n",
    "\n",
    "dfg_for_A.to_csv('dataset_for_A_v2.csv')\n",
    "dfg_filtered.to_csv('dataset_B_filelist.csv')\n",
    "\n",
    "#this dataset A covers around 50 fewer matches than dataset B\n",
    "#will be using dataset A for time-related analyses and maybe a few other things\n",
    "#And will be using dataset B to generate a list of files with \n",
    "#more or less proper formatting so they can be scraped\n",
    "#not having the proper minute data in dataset B wont matter given my plans for it.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#i ended up not using the data resulting from the code below\n",
    "#this data could help us look at players from which nationalities get penalised more by referees\n",
    "#will be using this data for later article (I call dibs! please dont steal my story idea)\n",
    "#only included this code because i used to create the first graphic in the real article on which\n",
    "#referees are the card happiest\n",
    "\n",
    "import re\n",
    "from lxml import html\n",
    "import sys\n",
    "import os\n",
    "import csv\n",
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv('dataset_B_filelist.csv')\n",
    "nats = df.player_nationality_in_url.unique().tolist()\n",
    "#if you see the results you'll see that the value between australia and cameroon is some weird value\n",
    "#nats.index('australia')\n",
    "#nats.index('cameroon')\n",
    "#nats[27]\n",
    "del nats[27]\n",
    "df_heads = list(df)\n",
    "df_heads_new = df_heads[:11]\n",
    "headingsx = df_heads_new + nats\n",
    "filenames_deux = df.file_name.unique().tolist()\n",
    "\n",
    "with open(\"dataset_B_cards.csv\", \"w\") as filex:   #after this change mode to a for append, otherwise file gets overwritten\n",
    "    wr = csv.writer(filex, delimiter = ',' , quotechar = '\"' )\n",
    "    wr.writerow(headingsx)\n",
    "\n",
    "with open(\"dataset_B_total.csv\", \"w\") as filez:   #after this change mode to a for append, otherwise file gets overwritten\n",
    "    wr = csv.writer(filez, delimiter = ',' , quotechar = '\"' )\n",
    "    wr.writerow(headingsx)\n",
    "\n",
    "for deux in filenames_deux:\n",
    "    fnadr =  os.path.join(os.path.dirname(os.path.realpath('__file__')), deux)\n",
    "    with open(fnadr) as filey:\n",
    "        row_listc = []\n",
    "        row_listt = []\n",
    "        sourcex = filey.read()\n",
    "        treex = html.document_fromstring(sourcex)\n",
    "        row_listc.append(fnad)\n",
    "        row_listt.append(fnad)\n",
    "        ref_code = fnad[0:5]\n",
    "        row_listc.append(ref_code)\n",
    "        row_listt.append(ref_code)\n",
    "        ref_name_url = treex.cssselect('dl.details > dd:nth-child(2) > a:nth-child(1)')[0].attrib['href']\n",
    "        ref_name_urlx = (re.findall(r'/referees/(.*)/.*/', ref_name_url))[0]\n",
    "        row_listc.append(ref_name_urlx)\n",
    "        row_listt.append(ref_name_urlx)\n",
    "        ref_nationality = treex.cssselect('dl.details > dd:nth-child(2) > a:nth-child(1)')[0].attrib['class']\n",
    "        ref_natx = (re.findall(r'flag_16 left_16 (.+?)_16_left referee', ref_nationality))[0]\n",
    "        row_listc.append(ref_natx)\n",
    "        row_listt.append(ref_natx)\n",
    "        match_code = treex.cssselect('#submenu > ul:nth-child(1) > li:nth-child(1) > a:nth-child(1)')[0].attrib['href']\n",
    "        match_codex = (re.findall(r'/matches/.*/.*/.*/.*/.*/.*/.*/(.*)/', match_code))[0]\n",
    "        row_listc.append(match_codex)\n",
    "        row_listt.append(match_codex)\n",
    "        try:\n",
    "            if (treex.cssselect('.scoretime > span:nth-child(1)')[0].text_content() or treex.cssselect('.scoretime > span:nth-child(2)')[0].text_content()) == ('E' or 'P'):\n",
    "                match_duration = 120\n",
    "                row_listc.append(match_duration)\n",
    "                row_listt.append(match_duration)\n",
    "        except:\n",
    "            match_duration = 90\n",
    "            row_listc.append(match_duration)\n",
    "            row_listt.append(match_duration)\n",
    "        home_team_in_url = treex.cssselect('#page_match_1_block_match_info_4 > div:nth-child(1) > div:nth-child(1) > h3:nth-child(1) > a:nth-child(1)')[0].attrib['href']\n",
    "        home_teamx = (re.findall(r'/teams/(.*/.*)/.*/', home_team_in_url))[0]\n",
    "        row_listc.append(home_teamx)\n",
    "        row_listt.append(home_teamx)\n",
    "        away_team_in_url = treex.cssselect('#page_match_1_block_match_info_4 > div:nth-child(1) > div:nth-child(3) > h3:nth-child(1) > a:nth-child(1)')[0].attrib['href']\n",
    "        away_teamx = (re.findall(r'/teams/(.*/.*)/.*/', away_team_in_url))[0]\n",
    "        row_listc.append(away_teamx)\n",
    "        row_listt.append(away_teamx)\n",
    "        date_match = treex.cssselect('div.details:nth-child(1) > dl:nth-child(1) > dd:nth-child(4) > a:nth-child(1)')[0].attrib['href']\n",
    "        date_matchx = (re.findall(r'/.*/(.*/.*/.*)/', date_match))[0]\n",
    "        row_listc.append(date_matchx)\n",
    "        row_listt.append(date_matchx)\n",
    "        competition = treex.cssselect('div.details:nth-child(1) > dl:nth-child(1) > dd:nth-child(2) > a:nth-child(1)')[0].attrib['href']\n",
    "        compx = (re.findall(r'/(.*/.*/.*)/.*/.*/.*/', competition))[0]\n",
    "        row_listc.append(compx)\n",
    "        row_listt.append(compx)\n",
    "        scorex = treex.cssselect('.bidi')[0].text_content()\n",
    "        score_list = scorex.split()\n",
    "        if score_list[0] > score_list[2]:\n",
    "            winning_team = home_teamx\n",
    "            row_listc.append(winning_team)\n",
    "            row_listt.append(winning_team)\n",
    "        elif score_list[0] < score_list[2]:\n",
    "            winning_team = away_teamx\n",
    "            row_listc.append(winning_team)\n",
    "            row_listt.append(winning_team)\n",
    "        else:\n",
    "            winning_team = 'draw'\n",
    "            row_listc.append(winning_team)\n",
    "            row_listt.append(winning_team)\n",
    "        #create player set using | operator\n",
    "        players = treex.xpath(\"//*[@id='page_match_1_block_match_lineups_8']//a[contains(@href,'players')]|//*[@id='page_match_1_block_match_substitutes_9']//p[@class='substitute substitute-in']/a[contains(@href,'players')]/following::img[contains(@src,'SI.png')][1]/preceding::a[1]\")\n",
    "        for nat in nats:\n",
    "            cards = 0\n",
    "            totalx = 0\n",
    "            for player in players:\n",
    "                player_class = player.xpath('@class')[0]\n",
    "                if nat == re.findall(r'flag_16 left_16 (.+?)_16_left', player_class)[0]:\n",
    "                    #set up card finder\n",
    "                    if player.xpath(\"following::td[1]//img[contains(@src,'YC.png') or contains(@src,'RC.png') or contains(@src,'Y2C.png')]\") != []:\n",
    "                        cards += 1\n",
    "                        totalx += 1\n",
    "                    else:\n",
    "                        totalx += 1\n",
    "                else:\n",
    "                    continue\n",
    "            row_listc.append(cards)\n",
    "            row_listt.append(totalx)\n",
    "        \n",
    "        with open(\"dataset_B_cards.csv\", \"a\") as filex: \n",
    "            wr = csv.writer(filex, delimiter = ',' , quotechar = '\"' )\n",
    "            wr.writerow(row_listc)\n",
    "\n",
    "        with open(\"dataset_B_total.csv\", \"a\") as filez: \n",
    "            wr = csv.writer(filez, delimiter = ',' , quotechar = '\"' )\n",
    "            wr.writerow(row_listt)\n",
    "\n",
    "filex.close()\n",
    "filez.close()\n",
    "print 'done and done'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#code below was written to figure out which referees\n",
    "# are particularly trigger happy when it comes to bookings and cards\n",
    "#\n",
    "\n",
    "df_world_total = pd.read_csv('dataset_B_total.csv')\n",
    "df_world_cards = pd.read_csv('dataset_B_cards.csv')\n",
    "\n",
    "#dont remember why but for some reason, i wasnt happy with \n",
    "#the dataframe loaded from the csv, so i did some slicing\n",
    "#below to create a new one\n",
    "\n",
    "df_world_cards.columns.get_loc(\"northern-ireland\")\n",
    "df_world_cards.columns.get_loc(\"equatorial-guinea\")\n",
    "df_world_cards.columns.get_loc(\"ref_name_in_url\")\n",
    "df_world_cards.columns.get_loc(\"match_code\")\n",
    "df_world_cards.columns.get_loc(\"competition\")\n",
    "\n",
    "\n",
    "card_slices = [df_world_cards.ix[:,2], df_world_cards.ix[:,4],df_world_cards.ix[:,9],df_world_cards.ix[:,11:160]]\n",
    "df_world_cards_new = pd.concat(card_slices, axis=1)\n",
    "\n",
    "total_slices = [df_world_total.ix[:,2], df_world_total.ix[:,4],df_world_total.ix[:,9],df_world_total.ix[:,11:160]]\n",
    "df_world_total_new = pd.concat(total_slices, axis=1)\n",
    "\n",
    "#these four lines below sum up the number of appearances \n",
    "#the 'total new sum' dataframe and cards for all players \n",
    "#in all matches they have appeared in\n",
    "#note that this wont take into cards for players\n",
    "#whey they've been on the subs bench (that happens sometimes)\n",
    "\n",
    "df_world_total_new_sum = df_world_total_new.groupby('ref_name_in_url').sum()\n",
    "df_world_cards_new_sum = df_world_cards_new.groupby('ref_name_in_url').sum()\n",
    "\n",
    "#this is just a simple no. of cards for all players divided by\n",
    "#no of appearances for all players\n",
    "\n",
    "b = df_world_cards_new_sum.sum(axis=1)\n",
    "c = df_world_total_new_sum.sum(axis=1)\n",
    "df_world_ratio_new = (b/c)*100\n",
    "\n",
    "graphic_A = [e, c, b, df_world_ratio_new]\n",
    "df_for_graphic_A = pd.concat(graphic_A, axis=1)\n",
    "\n",
    "df_for_graphic_A.columns = ['no_of_matches', 'player_appearances', 'cards', 'cards_to_appearance_ratio']\n",
    "df_for_graphic_A.to_csv('graphic_A.csv')\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "dfg = pd.read_csv('dataset_for_A_v2.csv')\n",
    "del dfg['Unnamed: 0']\n",
    "#deleting the index that got added on as a column for some reason\n",
    "dfg.minute_card_given.fillna('',inplace=True)\n",
    "dfg.player_name_in_url.fillna('zero',inplace=True)\n",
    "dfg.player_nationality_in_url.fillna('zero',inplace=True)\n",
    "\n",
    "#did the commands above because there were a lot of null \n",
    "#values created because of the matches where there were no \n",
    "#cards, so just filled them up with 'zero'\n",
    "dfg['minute_card_given'] = dfg['minute_card_given'].str.strip(\" \") #remove whitespace at beginning\n",
    "dfg['minute_card_given'] = dfg['minute_card_given'].str.strip(\"'\") #remove minute mark (')\n",
    "dfg[dfg['minute_card_given'].str.contains(r\"\\+\")].groupby('minute_card_given').match_code.nunique()\n",
    "#checking how many cards are given in injury time\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#so because there are a lot of cards given in injury time\n",
    "# there are a lot of cards where the minutes added on are denoted\n",
    "#by '+', so 45+3, 90+1 etc.\n",
    "#this was messing up with my calculations because they werent\n",
    "#being recognised as numbers, so i wrote the code below\n",
    "#to take care of this by, wait for it, extending each half by\n",
    "#five minutes! so the second half starts at minute 50 instead of \n",
    "#45, if there's extra time, extra time starts at minute 100\n",
    "#instead of 90 etc. I did all this because i was thinking\n",
    "#of doing something related to mean time between cards\n",
    "#but dropped that idea later\n",
    "\n",
    "fake_list4 = []\n",
    "for s in dfg['minute_card_given']:\n",
    "    if '+' in s:\n",
    "        t = s.split(\"+\")\n",
    "        if int(t[0]) == 45:\n",
    "            if int(t[1]) > 5:\n",
    "                v = 50\n",
    "            elif int(t[1]) <= 5:\n",
    "                v = 45 + int(t[1])\n",
    "        elif int(t[0]) == 90:\n",
    "            if int(t[1]) > 5:\n",
    "                v = 100\n",
    "            elif int(t[1]) <= 5:\n",
    "                v = 95 + int(t[1])\n",
    "        elif int(t[0]) == 105:\n",
    "            v = 115\n",
    "        elif int(t[0]) == 120:\n",
    "            v = 130\n",
    "        fake_list4.append(v)\n",
    "    else:\n",
    "        try:\n",
    "            t = int(s)\n",
    "            if t > 90:\n",
    "                v = t + 10\n",
    "            elif t > 45:\n",
    "                v = t + 5\n",
    "            elif t > 0:\n",
    "                v = t\n",
    "            fake_list4.append(v)\n",
    "        except:\n",
    "            fake_list4.append('none') \n",
    "\n",
    "fake_list4\n",
    "\n",
    "df_02= pd.DataFrame({'min_card_new': fake_list4})\n",
    "#am adding this transformed times column named 'min_card_new'\n",
    "#to the dataframe\n",
    "df_card_analysis = pd.concat([dfg, df_02], axis=1)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#taking out matches where there was no card given\n",
    "df_first_card = df_card_analysis[df_card_analysis['min_card_new'] != 'none']\n",
    "\n",
    "#now the analysis\n",
    "\n",
    "#this code below is to calculate the mean time of the first yellow and red card\n",
    "df_first_yellow = df_first_card[df_first_card['card_type'] == 'YC']\n",
    "df_first_red = df_first_card[df_first_card['card_type'] == 'RC']\n",
    "df_first_yellow_mean = df_first_yellow.groupby(['ref_name_in_url','file_name'], as_index=False)['min_card_new'].min().groupby('ref_name_in_url')['min_card_new'].mean()\n",
    "df_first_red_mean = df_first_red.groupby(['ref_name_in_url','file_name'], as_index=False)['min_card_new'].min().groupby('ref_name_in_url')['min_card_new'].mean()\n",
    "\n",
    "first_combo = [df_first_yellow_mean, df_first_red_mean]\n",
    "df_first_combo = pd.concat(first_combo, axis=1) #putting the two dataframes together\n",
    "\n",
    "df_first_combo.columns = ['first_yellow_mean', 'first_red_mean']\n",
    "#giving this dataframe new column names above\n",
    "\n",
    "df_first_combo.to_csv('graphic_first_card.csv')\n",
    "#putting the data in a csv so that i can use it in something \n",
    "#like Tableau to create the visualisation. Wasnt happy\n",
    "#with the options within the Python world, every graphic\n",
    "#looks so science-y, which is understandable given their \n",
    "#origins in academia, but all of them -- matplotlib, seaborn,\n",
    "#bokeh, they all look so bad!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#so we are trying to figure out how many times a referee took out a 2nd yellow card for a player and sent \n",
    "#him off. so to do that we first find out how many yellow cards were given by each ref\n",
    "#we then divide the no. of matches where theres a Y2C, ie. 2nd yellow leading to a red, and so we get the \n",
    "#conversion ratio for yellows to reds for each referee\n",
    "\n",
    "#len(df_first_card) = 19559\n",
    "df_card_04 = df_first_card[df_first_card['card_type'] == 'YC']\n",
    "#len(df_card_04) = 18459\n",
    "df_card_05 = df_card_04.groupby('ref_name_in_url')['min_card_new'].count()\n",
    "#this line above gives us total no of yellow cards per referee (excluding second yellows and straight reds)\n",
    "\n",
    "#now to get the no of matches where a second yellow led to a red\n",
    "#you do this by restricting the frame to matches where there are second yellows\n",
    "\n",
    "df_card_06 = df_first_card[df_first_card['card_type'] == 'Y2C']\n",
    "#this line below gives us total no of second yellows per referee (and excludes straight reds)\n",
    "df_card_07 = df_card_06.groupby('ref_name_in_url')['min_card_new'].count()\n",
    "\n",
    "df_card_08 = (df_card_07/df_card_05)*100\n",
    "#above is the calculation of the percentage of \n",
    "#yellow cards converted into second yellows\n",
    "#this is all possible because soccerway helpfully\n",
    "#has a second yellow category of card in addition\n",
    "#to the normal yellow and red card\n",
    "\n",
    "#the lines below concatenate the frames together and saves\n",
    "#everything in a csv file\n",
    "card_conversion = [df_card_05, df_card_07, df_card_08]\n",
    "df_card_convert = pd.concat(card_conversion, axis=1)\n",
    "df_card_convert_v2 = df_card_convert.reset_index()\n",
    "df_card_convert_v2.columns = ['referee_name', 'total_yellow_cards', 'total_second_yellows', 'ratio']\n",
    "df_card_convert_v2.to_csv('graphic_y_to_r.csv', index=False)\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#for the next graphic, we find out who has historically pulled card after card after card in a match\n",
    "df_card_total_v2 = df_first_card.groupby('ref_name_in_url')['min_card_new'].count()\n",
    "#this gives us no. of cards per referee (yellows, second yellows and reds)\n",
    "#this is the base that we'll be using to calulate percentages of first cards, second cards, thrid cards etc.\n",
    "\n",
    "#df_first_card is the dataframe where the matches without any cards shown have been excluded\n",
    "\n",
    "df_card_percent_first = df_first_card.sort_values('min_card_new').groupby(['ref_name_in_url','file_name']).head(1).groupby('ref_name_in_url')['min_card_new'].count()\n",
    "\n",
    "#code below will help us calculate the percentages and save\n",
    "#the dataframe as a csv\n",
    "#being a newbie, the code below took me a few hours to figure out\n",
    "#probably the first time I defined my own function too\n",
    "#outside of an online tutorial\n",
    "\n",
    "def card(t):\n",
    "    return df_first_card.sort_values('min_card_new').groupby(['ref_name_in_url','file_name']).head(t).groupby('ref_name_in_url')['min_card_new'].count()\n",
    "\n",
    "tot = df_card_total_v2\n",
    "t = 1\n",
    "sr_concat = []\n",
    "label_list = []\n",
    "sr_tot = pd.Series(index=df_card_total_v2.index, dtype='float64') #create empty series\n",
    "sr_tot = sr_tot.fillna(0) # fill with 0s rather than NaNs , thank you stackoverlow!\n",
    "sr_100v = pd.Series(data=100,index=df_card_total_v2.index, dtype='float64')\n",
    "#sr_100v = sr_tot.fillna(100.0) this wasnt working for some reason, could be because of some inplace nonsense\n",
    "\n",
    "print sr_100v\n",
    "\n",
    "while t < 6:\n",
    "    print \"t= \", t\n",
    "    z = ((card(t) - card(t-1))/tot)*100\n",
    "    if z.isnull().any():\n",
    "        xy = (card(t)/tot)*100\n",
    "    else:\n",
    "        xy = z\n",
    "    print 'xy'\n",
    "    print xy\n",
    "    sr_concat.append(xy)\n",
    "    sr_tot += xy\n",
    "    #print 'sr_tot'\n",
    "    #print sr_tot\n",
    "\n",
    "    #print 'sr_concat'\n",
    "    #print sr_concat\n",
    "    label_name = 'card_' + str(t)\n",
    "    label_list.append(label_name)\n",
    "    t += 1\n",
    "\n",
    "print sr_tot\n",
    "xy = sr_100v - sr_tot\n",
    "print \"t=6\", 6\n",
    "#print 'xy'\n",
    "print xy\n",
    "sr_concat.append(xy)\n",
    "#print 'sr_concat'\n",
    "#print sr_concat\n",
    "label_name = 'the_rest'\n",
    "label_list.append(label_name)\n",
    "\n",
    "df_card_percent = pd.concat(sr_concat, axis=1) #putting the two dataframes together\n",
    "#df_card_convert_v2 = df_card_convert.reset_index() \n",
    "df_card_percent.columns = label_list\n",
    "df_card_percent.to_csv('graphic_card_percent.csv')\n",
    "#sometimes \", index = False\" works better when saving to a csv and the first column doesnt have a label\n",
    "\n",
    "print 'done and done'\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11+"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}