{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# pandas I/O tools and examples"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"
"
],
"text/plain": [
""
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import addutils.toc ; addutils.toc.js(ipy_notebook=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n"
],
"text/plain": [
""
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from addutils import css_notebook\n",
"css_notebook()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1 Matlab Variables"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.1 Import a Matlab variable from file"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n"
],
"text/plain": [
""
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"import os \n",
"import scipy.io\n",
"import numpy as np\n",
"import pandas as pd\n",
"import sqlite3\n",
"#from pandas.io.data import DataFrame, read_csv\n",
"import pandas.io.sql as psql\n",
"from time import time\n",
"from IPython.display import display\n",
"from addutils import css_notebook\n",
"css_notebook()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sys"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Import from '.mat' files"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"x = scipy.io.loadmat('example_data/matlab_variable.mat')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 92, 99, 1, 8, 15, 67, 74, 51, 58, 40],\n",
" [ 98, 80, 7, 14, 16, 73, 55, 57, 64, 41],\n",
" [ 4, 81, 88, 20, 22, 54, 56, 63, 70, 47],\n",
" [ 85, 87, 19, 21, 3, 60, 62, 69, 71, 28],\n",
" [ 86, 93, 25, 2, 9, 61, 68, 75, 52, 34],\n",
" [ 17, 24, 76, 83, 90, 42, 49, 26, 33, 65],\n",
" [ 23, 5, 82, 89, 91, 48, 30, 32, 39, 66],\n",
" [ 79, 6, 13, 95, 97, 29, 31, 38, 45, 72],\n",
" [ 10, 12, 94, 96, 78, 35, 37, 44, 46, 53],\n",
" [ 11, 18, 100, 77, 84, 36, 43, 50, 27, 59]], dtype=uint8)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pyA = x['a']\n",
"pyA"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The Matlab variable is passed to a pandas DataFrame:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 92 | \n",
" 99 | \n",
" 1 | \n",
" 8 | \n",
" 15 | \n",
" 67 | \n",
" 74 | \n",
" 51 | \n",
" 58 | \n",
" 40 | \n",
"
\n",
" \n",
" 1 | \n",
" 98 | \n",
" 80 | \n",
" 7 | \n",
" 14 | \n",
" 16 | \n",
" 73 | \n",
" 55 | \n",
" 57 | \n",
" 64 | \n",
" 41 | \n",
"
\n",
" \n",
" 2 | \n",
" 4 | \n",
" 81 | \n",
" 88 | \n",
" 20 | \n",
" 22 | \n",
" 54 | \n",
" 56 | \n",
" 63 | \n",
" 70 | \n",
" 47 | \n",
"
\n",
" \n",
" 3 | \n",
" 85 | \n",
" 87 | \n",
" 19 | \n",
" 21 | \n",
" 3 | \n",
" 60 | \n",
" 62 | \n",
" 69 | \n",
" 71 | \n",
" 28 | \n",
"
\n",
" \n",
" 4 | \n",
" 86 | \n",
" 93 | \n",
" 25 | \n",
" 2 | \n",
" 9 | \n",
" 61 | \n",
" 68 | \n",
" 75 | \n",
" 52 | \n",
" 34 | \n",
"
\n",
" \n",
" 5 | \n",
" 17 | \n",
" 24 | \n",
" 76 | \n",
" 83 | \n",
" 90 | \n",
" 42 | \n",
" 49 | \n",
" 26 | \n",
" 33 | \n",
" 65 | \n",
"
\n",
" \n",
" 6 | \n",
" 23 | \n",
" 5 | \n",
" 82 | \n",
" 89 | \n",
" 91 | \n",
" 48 | \n",
" 30 | \n",
" 32 | \n",
" 39 | \n",
" 66 | \n",
"
\n",
" \n",
" 7 | \n",
" 79 | \n",
" 6 | \n",
" 13 | \n",
" 95 | \n",
" 97 | \n",
" 29 | \n",
" 31 | \n",
" 38 | \n",
" 45 | \n",
" 72 | \n",
"
\n",
" \n",
" 8 | \n",
" 10 | \n",
" 12 | \n",
" 94 | \n",
" 96 | \n",
" 78 | \n",
" 35 | \n",
" 37 | \n",
" 44 | \n",
" 46 | \n",
" 53 | \n",
"
\n",
" \n",
" 9 | \n",
" 11 | \n",
" 18 | \n",
" 100 | \n",
" 77 | \n",
" 84 | \n",
" 36 | \n",
" 43 | \n",
" 50 | \n",
" 27 | \n",
" 59 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6 7 8 9\n",
"0 92 99 1 8 15 67 74 51 58 40\n",
"1 98 80 7 14 16 73 55 57 64 41\n",
"2 4 81 88 20 22 54 56 63 70 47\n",
"3 85 87 19 21 3 60 62 69 71 28\n",
"4 86 93 25 2 9 61 68 75 52 34\n",
"5 17 24 76 83 90 42 49 26 33 65\n",
"6 23 5 82 89 91 48 30 32 39 66\n",
"7 79 6 13 95 97 29 31 38 45 72\n",
"8 10 12 94 96 78 35 37 44 46 53\n",
"9 11 18 100 77 84 36 43 50 27 59"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(pyA)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2 Importing a compressed CSV"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following example shows how to import directly a compressed csv file, in this case with multiple separators:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ID | \n",
" Names | \n",
" Phone | \n",
" Town | \n",
" Description | \n",
" Income | \n",
" Coordinates | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Lydia Z. Flores | \n",
" (02) 0548 5995 | \n",
" Haddington | \n",
" sed, sapien. Nunc pulvinar arcu | \n",
" â¬6,223 | \n",
" 23.28054, -24.48755 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Bell X. Guerra | \n",
" (07) 8599 9926 | \n",
" Montes Claros | \n",
" consectetuer, cursus et, magna. Praesent | \n",
" â¬14,967 | \n",
" -77.76329, 69.22339 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Kirk Q. Bowman | \n",
" (06) 4153 7501 | \n",
" Cagli | \n",
" tortor. Nunc commodo auctor velit. | \n",
" â¬17,399 | \n",
" -58.80037, 22.50537 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Karen Pickett | \n",
" (02) 3216 9708 | \n",
" Cobourg | \n",
" at auctor ullamcorper, nisl arcu | \n",
" â¬17,373 | \n",
" 75.73982, -78.01872 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Vance J. Johnson | \n",
" (01) 7568 6371 | \n",
" Carlton | \n",
" ultricies adipiscing, enim mi tempor | \n",
" â¬9,025 | \n",
" -86.51337, 109.46298 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ID Names Phone Town \\\n",
"0 1 Lydia Z. Flores (02) 0548 5995 Haddington \n",
"1 2 Bell X. Guerra (07) 8599 9926 Montes Claros \n",
"2 3 Kirk Q. Bowman (06) 4153 7501 Cagli \n",
"3 4 Karen Pickett (02) 3216 9708 Cobourg \n",
"4 5 Vance J. Johnson (01) 7568 6371 Carlton \n",
"\n",
" Description Income \\\n",
"0 sed, sapien. Nunc pulvinar arcu â¬6,223 \n",
"1 consectetuer, cursus et, magna. Praesent â¬14,967 \n",
"2 tortor. Nunc commodo auctor velit. â¬17,399 \n",
"3 at auctor ullamcorper, nisl arcu â¬17,373 \n",
"4 ultricies adipiscing, enim mi tempor â¬9,025 \n",
"\n",
" Coordinates \n",
"0 23.28054, -24.48755 \n",
"1 -77.76329, 69.22339 \n",
"2 -58.80037, 22.50537 \n",
"3 75.73982, -78.01872 \n",
"4 -86.51337, 109.46298 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_csv = pd.read_csv('example_data/pd12_peopl.csv.gz', sep=r'\\,\\;\\.', \n",
" skipinitialspace=True, compression='gzip', engine='python',\n",
" encoding='iso8859_15')\n",
"df_csv.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**TODO**: utf-8 has problems decoding euro sign. \"\\u00e2\\u0082\\u00ac\" are the three chars you get when the UTF-8 encoded \\u20ac (EURO SIGN) is mistakenly interpreted as ISO-8859-1."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ID | \n",
" Names | \n",
" Phone | \n",
" Town | \n",
" Description | \n",
" Income | \n",
" Coordinates | \n",
" lat | \n",
" lon | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Lydia Z. Flores | \n",
" (02) 0548 5995 | \n",
" Haddington | \n",
" sed, sapien. Nunc pulvinar arcu | \n",
" â¬6,223 | \n",
" 23.28054, -24.48755 | \n",
" 23.28054 | \n",
" -24.48755 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Bell X. Guerra | \n",
" (07) 8599 9926 | \n",
" Montes Claros | \n",
" consectetuer, cursus et, magna. Praesent | \n",
" â¬14,967 | \n",
" -77.76329, 69.22339 | \n",
" -77.76329 | \n",
" 69.22339 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Kirk Q. Bowman | \n",
" (06) 4153 7501 | \n",
" Cagli | \n",
" tortor. Nunc commodo auctor velit. | \n",
" â¬17,399 | \n",
" -58.80037, 22.50537 | \n",
" -58.80037 | \n",
" 22.50537 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Karen Pickett | \n",
" (02) 3216 9708 | \n",
" Cobourg | \n",
" at auctor ullamcorper, nisl arcu | \n",
" â¬17,373 | \n",
" 75.73982, -78.01872 | \n",
" 75.73982 | \n",
" -78.01872 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Vance J. Johnson | \n",
" (01) 7568 6371 | \n",
" Carlton | \n",
" ultricies adipiscing, enim mi tempor | \n",
" â¬9,025 | \n",
" -86.51337, 109.46298 | \n",
" -86.51337 | \n",
" 109.46298 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ID Names Phone Town \\\n",
"0 1 Lydia Z. Flores (02) 0548 5995 Haddington \n",
"1 2 Bell X. Guerra (07) 8599 9926 Montes Claros \n",
"2 3 Kirk Q. Bowman (06) 4153 7501 Cagli \n",
"3 4 Karen Pickett (02) 3216 9708 Cobourg \n",
"4 5 Vance J. Johnson (01) 7568 6371 Carlton \n",
"\n",
" Description Income \\\n",
"0 sed, sapien. Nunc pulvinar arcu â¬6,223 \n",
"1 consectetuer, cursus et, magna. Praesent â¬14,967 \n",
"2 tortor. Nunc commodo auctor velit. â¬17,399 \n",
"3 at auctor ullamcorper, nisl arcu â¬17,373 \n",
"4 ultricies adipiscing, enim mi tempor â¬9,025 \n",
"\n",
" Coordinates lat lon \n",
"0 23.28054, -24.48755 23.28054 -24.48755 \n",
"1 -77.76329, 69.22339 -77.76329 69.22339 \n",
"2 -58.80037, 22.50537 -58.80037 22.50537 \n",
"3 75.73982, -78.01872 75.73982 -78.01872 \n",
"4 -86.51337, 109.46298 -86.51337 109.46298 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Split 'Coordinates' into lat/lon with a vectorized string split --\n",
"# far faster than the per-row loop with .loc writes it replaces.\n",
"# astype(str) mirrors the old str(coo) coercion; .str[1] yields NaN\n",
"# (instead of raising) if a value has no comma.\n",
"split_coords = df_csv['Coordinates'].astype(str).str.split(',')\n",
"df_csv['lat'] = split_coords.str[0]\n",
"df_csv['lon'] = split_coords.str[1]\n",
"df_csv.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_csv.to_json('temp/converted_json.json')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3 Importing JSON files"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Coordinates | \n",
" Description | \n",
" ID | \n",
" Income | \n",
" Names | \n",
" Phone | \n",
" Town | \n",
" lat | \n",
" lon | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 23.28054, -24.48755 | \n",
" sed, sapien. Nunc pulvinar arcu | \n",
" 1 | \n",
" â¬6,223 | \n",
" Lydia Z. Flores | \n",
" (02) 0548 5995 | \n",
" Haddington | \n",
" 23.28054 | \n",
" -24.48755 | \n",
"
\n",
" \n",
" 1 | \n",
" -77.76329, 69.22339 | \n",
" consectetuer, cursus et, magna. Praesent | \n",
" 2 | \n",
" â¬14,967 | \n",
" Bell X. Guerra | \n",
" (07) 8599 9926 | \n",
" Montes Claros | \n",
" -77.76329 | \n",
" 69.22339 | \n",
"
\n",
" \n",
" 10 | \n",
" -62.95199, 116.29751 | \n",
" Duis sit amet diam eu | \n",
" 11 | \n",
" â¬15,764 | \n",
" Jameson H. Craig | \n",
" (07) 8640 5274 | \n",
" Fairbanks | \n",
" -62.95199 | \n",
" 116.29751 | \n",
"
\n",
" \n",
" 11 | \n",
" -42.26793, -92.23605 | \n",
" et netus et malesuada fames | \n",
" 12 | \n",
" â¬18,460 | \n",
" Michael Randall | \n",
" (04) 3451 8606 | \n",
" Mezzana | \n",
" -42.26793 | \n",
" -92.23605 | \n",
"
\n",
" \n",
" 12 | \n",
" -20.39897, 152.4122 | \n",
" id risus quis diam luctus | \n",
" 13 | \n",
" â¬6,175 | \n",
" Sierra Rivers | \n",
" (01) 2020 4511 | \n",
" Lugnano in Teverina | \n",
" -20.39897 | \n",
" 152.41220 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Coordinates Description ID \\\n",
"0 23.28054, -24.48755 sed, sapien. Nunc pulvinar arcu 1 \n",
"1 -77.76329, 69.22339 consectetuer, cursus et, magna. Praesent 2 \n",
"10 -62.95199, 116.29751 Duis sit amet diam eu 11 \n",
"11 -42.26793, -92.23605 et netus et malesuada fames 12 \n",
"12 -20.39897, 152.4122 id risus quis diam luctus 13 \n",
"\n",
" Income Names Phone \\\n",
"0 â¬6,223 Lydia Z. Flores (02) 0548 5995 \n",
"1 â¬14,967 Bell X. Guerra (07) 8599 9926 \n",
"10 â¬15,764 Jameson H. Craig (07) 8640 5274 \n",
"11 â¬18,460 Michael Randall (04) 3451 8606 \n",
"12 â¬6,175 Sierra Rivers (01) 2020 4511 \n",
"\n",
" Town lat lon \n",
"0 Haddington 23.28054 -24.48755 \n",
"1 Montes Claros -77.76329 69.22339 \n",
"10 Fairbanks -62.95199 116.29751 \n",
"11 Mezzana -42.26793 -92.23605 \n",
"12 Lugnano in Teverina -20.39897 152.41220 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pd.read_json accepts a file path directly; no need to read the file\n",
"# by hand and re-encode it to bytes first.\n",
"dfjson = pd.read_json('temp/converted_json.json')\n",
"dfjson.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the import reordered columns in alphabetical order, we can choose a preferred column order:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ID | \n",
" Names | \n",
" Phone | \n",
" Income | \n",
" Town | \n",
" lat | \n",
" lon | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Lydia Z. Flores | \n",
" (02) 0548 5995 | \n",
" â¬6,223 | \n",
" Haddington | \n",
" 23.28054 | \n",
" -24.48755 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Bell X. Guerra | \n",
" (07) 8599 9926 | \n",
" â¬14,967 | \n",
" Montes Claros | \n",
" -77.76329 | \n",
" 69.22339 | \n",
"
\n",
" \n",
" 10 | \n",
" 11 | \n",
" Jameson H. Craig | \n",
" (07) 8640 5274 | \n",
" â¬15,764 | \n",
" Fairbanks | \n",
" -62.95199 | \n",
" 116.29751 | \n",
"
\n",
" \n",
" 11 | \n",
" 12 | \n",
" Michael Randall | \n",
" (04) 3451 8606 | \n",
" â¬18,460 | \n",
" Mezzana | \n",
" -42.26793 | \n",
" -92.23605 | \n",
"
\n",
" \n",
" 12 | \n",
" 13 | \n",
" Sierra Rivers | \n",
" (01) 2020 4511 | \n",
" â¬6,175 | \n",
" Lugnano in Teverina | \n",
" -20.39897 | \n",
" 152.41220 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ID Names Phone Income \\\n",
"0 1 Lydia Z. Flores (02) 0548 5995 â¬6,223 \n",
"1 2 Bell X. Guerra (07) 8599 9926 â¬14,967 \n",
"10 11 Jameson H. Craig (07) 8640 5274 â¬15,764 \n",
"11 12 Michael Randall (04) 3451 8606 â¬18,460 \n",
"12 13 Sierra Rivers (01) 2020 4511 â¬6,175 \n",
"\n",
" Town lat lon \n",
"0 Haddington 23.28054 -24.48755 \n",
"1 Montes Claros -77.76329 69.22339 \n",
"10 Fairbanks -62.95199 116.29751 \n",
"11 Mezzana -42.26793 -92.23605 \n",
"12 Lugnano in Teverina -20.39897 152.41220 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# .ix was deprecated in pandas 0.20 and removed in 1.0; .loc performs the\n",
"# same label-based column selection here.\n",
"dfjson = dfjson.loc[:, ['ID', 'Names', 'Phone', 'Income', 'Town', 'lat', 'lon']]\n",
"dfjson.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4 Importing HTML"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: `read_html` returns a **list** of DataFrame objects, even if there is only a single table contained in the `HTML` content. (An older revision passed `infer_types=False` here to stop automatic numeric/date detection of the coordinates column; that keyword has since been removed from pandas.)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Nomi maschili | \n",
" Strade | \n",
" Città | \n",
" Coordinate | \n",
" Ente | \n",
" mail | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Drew | \n",
" 5672 Accumsan Road | \n",
" Whitburn | \n",
" -54.68429, -67.21709 | \n",
" Ac Consulting | \n",
" a.aliquet.vel@vitae.co.uk | \n",
"
\n",
" \n",
" 1 | \n",
" Finn | \n",
" Ap #170-4074 Interdum Rd. | \n",
" Pelotas | \n",
" 81.93087, 168.14556 | \n",
" Varius Incorporated | \n",
" nec.malesuada.ut@primisin.com | \n",
"
\n",
" \n",
" 2 | \n",
" Martin | \n",
" Ap #471-6260 Etiam Ave | \n",
" Aliano | \n",
" 33.99272, -148.70584 | \n",
" Arcu Industries | \n",
" lacus@risus.net | \n",
"
\n",
" \n",
" 3 | \n",
" Zephania | \n",
" Ap #720-7951 Aliquam Ave | \n",
" Senneville | \n",
" -56.82569, -72.80435 | \n",
" Tristique PC | \n",
" arcu@sempereratin.co.uk | \n",
"
\n",
" \n",
" 4 | \n",
" Devin | \n",
" 3538 Dui St. | \n",
" Zwettl-Niederösterreich | \n",
" -11.90874, -149.61123 | \n",
" Imperdiet Dictum PC | \n",
" lectus.rutrum.urna@massaQuisqueporttitor.ca | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Nomi maschili Strade Città \\\n",
"0 Drew 5672 Accumsan Road Whitburn \n",
"1 Finn Ap #170-4074 Interdum Rd. Pelotas \n",
"2 Martin Ap #471-6260 Etiam Ave Aliano \n",
"3 Zephania Ap #720-7951 Aliquam Ave Senneville \n",
"4 Devin 3538 Dui St. Zwettl-Niederösterreich \n",
"\n",
" Coordinate Ente \\\n",
"0 -54.68429, -67.21709 Ac Consulting \n",
"1 81.93087, 168.14556 Varius Incorporated \n",
"2 33.99272, -148.70584 Arcu Industries \n",
"3 -56.82569, -72.80435 Tristique PC \n",
"4 -11.90874, -149.61123 Imperdiet Dictum PC \n",
"\n",
" mail \n",
"0 a.aliquet.vel@vitae.co.uk \n",
"1 nec.malesuada.ut@primisin.com \n",
"2 lacus@risus.net \n",
"3 arcu@sempereratin.co.uk \n",
"4 lectus.rutrum.urna@massaQuisqueporttitor.ca "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfhtml = pd.read_html('example_data/generated.html', header=0)\n",
"dfhtml[0].head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5 Importing Excel"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Names | \n",
" Streets | \n",
" Town | \n",
" Coordinates | \n",
" Corporation | \n",
" mail | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Stephen | \n",
" Ap #389-3365 Risus, St. | \n",
" Lelystad | \n",
" 51.91783, -47.01037 | \n",
" Consectetuer Rhoncus Nullam Corporation | \n",
" Suspendisse.sed@pedePraesenteu.org | \n",
"
\n",
" \n",
" 1 | \n",
" Kenneth | \n",
" 120-6483 Ligula. Ave | \n",
" Labrecque | \n",
" -1.92625, 10.02451 | \n",
" A Corporation | \n",
" orci@egetmollislectus.com | \n",
"
\n",
" \n",
" 2 | \n",
" Leo | \n",
" Ap #727-2085 Eget Av. | \n",
" Strathcona County | \n",
" -17.31839, 137.99307 | \n",
" Sed Limited | \n",
" Etiam@vel.org | \n",
"
\n",
" \n",
" 3 | \n",
" Joshua | \n",
" P.O. Box 425, 6462 Arcu Rd. | \n",
" Municipal District | \n",
" -51.34642, 80.32145 | \n",
" Mauris Sapien Cursus Corp. | \n",
" lorem@ornarelectusante.net | \n",
"
\n",
" \n",
" 4 | \n",
" Zephania | \n",
" 655 Et, St. | \n",
" Couillet | \n",
" 29.96525, 124.18391 | \n",
" Odio Semper Cursus Corp. | \n",
" metus@a.ca | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Names Streets Town \\\n",
"0 Stephen Ap #389-3365 Risus, St. Lelystad \n",
"1 Kenneth 120-6483 Ligula. Ave Labrecque \n",
"2 Leo Ap #727-2085 Eget Av. Strathcona County \n",
"3 Joshua P.O. Box 425, 6462 Arcu Rd. Municipal District \n",
"4 Zephania 655 Et, St. Couillet \n",
"\n",
" Coordinates Corporation \\\n",
"0 51.91783, -47.01037 Consectetuer Rhoncus Nullam Corporation \n",
"1 -1.92625, 10.02451 A Corporation \n",
"2 -17.31839, 137.99307 Sed Limited \n",
"3 -51.34642, 80.32145 Mauris Sapien Cursus Corp. \n",
"4 29.96525, 124.18391 Odio Semper Cursus Corp. \n",
"\n",
" mail \n",
"0 Suspendisse.sed@pedePraesenteu.org \n",
"1 orci@egetmollislectus.com \n",
"2 Etiam@vel.org \n",
"3 lorem@ornarelectusante.net \n",
"4 metus@a.ca "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dfxl = pd.read_excel('example_data/generated2.xls', 'foglio')\n",
"dfxl.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6 Working with SQL and databases"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 6.1 Write SQL"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's store the DataFrame opened from excel in a database. We use SQLite, a database engine library suitable for storing data in a single-file database. 'Names' is the name we chose for the database table we are creating:"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"con = sqlite3.connect(\"temp.sql\")\n",
"# The 'flavor' keyword was removed in pandas 0.23 (sqlite is the only\n",
"# built-in flavor anyway); to_sql returns None, so binding the result\n",
"# to a variable was dead code.\n",
"dfxl.to_sql('Names', con)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 6.2 Import SQL"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(100, 7)\n"
]
}
],
"source": [
"con = sqlite3.connect('temp.sql')\n",
"with con:\n",
" sql = \"SELECT * FROM Names;\"\n",
" df = psql.read_sql(sql, con)\n",
" print(df.shape)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" Names | \n",
" Streets | \n",
" Town | \n",
" Coordinates | \n",
" Corporation | \n",
" mail | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" Stephen | \n",
" Ap #389-3365 Risus, St. | \n",
" Lelystad | \n",
" 51.91783, -47.01037 | \n",
" Consectetuer Rhoncus Nullam Corporation | \n",
" Suspendisse.sed@pedePraesenteu.org | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" Kenneth | \n",
" 120-6483 Ligula. Ave | \n",
" Labrecque | \n",
" -1.92625, 10.02451 | \n",
" A Corporation | \n",
" orci@egetmollislectus.com | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" Leo | \n",
" Ap #727-2085 Eget Av. | \n",
" Strathcona County | \n",
" -17.31839, 137.99307 | \n",
" Sed Limited | \n",
" Etiam@vel.org | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" Joshua | \n",
" P.O. Box 425, 6462 Arcu Rd. | \n",
" Municipal District | \n",
" -51.34642, 80.32145 | \n",
" Mauris Sapien Cursus Corp. | \n",
" lorem@ornarelectusante.net | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" Zephania | \n",
" 655 Et, St. | \n",
" Couillet | \n",
" 29.96525, 124.18391 | \n",
" Odio Semper Cursus Corp. | \n",
" metus@a.ca | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index Names Streets Town \\\n",
"0 0 Stephen Ap #389-3365 Risus, St. Lelystad \n",
"1 1 Kenneth 120-6483 Ligula. Ave Labrecque \n",
"2 2 Leo Ap #727-2085 Eget Av. Strathcona County \n",
"3 3 Joshua P.O. Box 425, 6462 Arcu Rd. Municipal District \n",
"4 4 Zephania 655 Et, St. Couillet \n",
"\n",
" Coordinates Corporation \\\n",
"0 51.91783, -47.01037 Consectetuer Rhoncus Nullam Corporation \n",
"1 -1.92625, 10.02451 A Corporation \n",
"2 -17.31839, 137.99307 Sed Limited \n",
"3 -51.34642, 80.32145 Mauris Sapien Cursus Corp. \n",
"4 29.96525, 124.18391 Odio Semper Cursus Corp. \n",
"\n",
" mail \n",
"0 Suspendisse.sed@pedePraesenteu.org \n",
"1 orci@egetmollislectus.com \n",
"2 Etiam@vel.org \n",
"3 lorem@ornarelectusante.net \n",
"4 metus@a.ca "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"con.close()\n",
"os.remove(\"temp.sql\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7 Working with HDF5"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7.1 Storer format"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**HDFStore** is a dict-like object used by pandas to store datasets as **HDF5** files using the **PyTables** library. **HDF5** is a scientific hierarchical data format suitable for storing in a file very large and multi-dimensional data arrays. The **Storer** (fixed) format stores fixed arrays, which are not queryable and must be retrieved in their entirety."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Add DataFrames to the HDFStore object:"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"samples_01 = int(3e5)\n",
"samples_02 = int(1e5)\n",
"idx1 = pd.date_range('1/1/2000 12:00:00', periods=samples_01, freq='50ms', tz='Europe/Rome')\n",
"idx2 = pd.date_range('1/1/2000 12:05:00', periods=samples_02, freq='100ms', tz='Europe/Rome')\n",
"randn = np.random.randn"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df1 = pd.DataFrame(randn(samples_01, 3), index=idx1, columns=['A', 'B', 'C'])\n",
"df2 = pd.DataFrame(randn(samples_02, 4), index=idx2, columns=['A', 'B', 'C', 'D'])"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Size of the Dataset: 13.73291015625 MB\n"
]
}
],
"source": [
"print ('Size of the Dataset: ', (df1.values.nbytes+df1.values.nbytes)/2**20, ' MB')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 loops, best of 3: 32 ms per loop\n"
]
}
],
"source": [
"%%timeit\n",
"# pd.get_store was deprecated in 0.21 and removed; pd.HDFStore is the\n",
"# supported context manager with identical semantics.\n",
"with pd.HDFStore('temp/store53.h5') as store:\n",
"    store.put('storer/df1', df1)\n",
"    store.put('storer/df2', df2)\n",
"    store.put('to_remove', df2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Retrieve stored objects:"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['/to_remove', '/storer/df1', '/storer/df2']\n",
" A B C\n",
"2000-01-01 12:00:00.050000+01:00 -0.022728 0.015469 0.962709\n",
"2000-01-01 12:00:00.100000+01:00 -0.453089 0.513491 -1.270842\n",
" A B C\n",
"2000-01-01 12:00:00.050000+01:00 -0.022728 0.015469 0.962709\n",
"2000-01-01 12:00:00.100000+01:00 -0.453089 0.513491 -1.270842\n",
"Check retrieved data equal to original data: \n",
"True\n"
]
}
],
"source": [
"# pd.get_store was removed; pd.HDFStore is the supported replacement.\n",
"with pd.HDFStore('temp/store53.h5') as store:\n",
"    print(store.keys())\n",
"    df1_retrieved = store.get('storer/df1')\n",
"    print(df1_retrieved[1:3])\n",
"    print(df1[1:3])\n",
"    print('Check retrieved data equal to original data: ')\n",
"    print(df1_retrieved[1:3].equals(df1[1:3]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Delete objects:"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"File path: temp/store53.h5\n",
"/storer/df1 frame (shape->[300000,3])\n",
"/storer/df2 frame (shape->[100000,4])\n"
]
}
],
"source": [
"with pd.HDFStore('temp/store53.h5') as store:\n",
"    # store.remove raises KeyError when the key is absent; catch only\n",
"    # that instead of a bare except that would hide unrelated errors.\n",
"    try:\n",
"        store.remove('to_remove')\n",
"    except KeyError:\n",
"        pass\n",
"    print(store)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7.2 Table format"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The table format conceptually is shaped very much like a DataFrame and may be appended to in the same or other sessions. In addition, delete & query type operations are supported."
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with pd.HDFStore('temp/store53.h5') as store:\n",
"    # store.append creates a table automatically.\n",
"    # .iloc replaces the removed .ix: on this DatetimeIndex an integer\n",
"    # slice was positional, which is exactly what .iloc does.\n",
"    store.append('table/df1_appended', df1.iloc[:10000])"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# pd.HDFStore replaces the removed pd.get_store; .iloc replaces the\n",
"# removed .ix (integer slices on a DatetimeIndex were positional).\n",
"with pd.HDFStore('temp/store53.h5') as store:\n",
"    store.append('table/df1_appended', df1.iloc[10001:20000])\n",
"    store.append('table/df1_appended', df1.iloc[20001:50000])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 7.3 Querying a Table"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Query the table using boolean expression with in-line function evaluation:"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" A | \n",
" B | \n",
"
\n",
" \n",
" \n",
" \n",
" 2000-01-01 12:00:00.200000+01:00 | \n",
" -0.232597 | \n",
" -0.705741 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.250000+01:00 | \n",
" 0.183234 | \n",
" 0.024464 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.300000+01:00 | \n",
" -0.801220 | \n",
" -0.732630 | \n",
"
\n",
" \n",
" 2000-01-01 12:00:00.350000+01:00 | \n",
" 0.762667 | \n",
" -0.585462 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" A B\n",
"2000-01-01 12:00:00.200000+01:00 -0.232597 -0.705741\n",
"2000-01-01 12:00:00.250000+01:00 0.183234 0.024464\n",
"2000-01-01 12:00:00.300000+01:00 -0.801220 -0.732630\n",
"2000-01-01 12:00:00.350000+01:00 0.762667 -0.585462"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"with pd.get_store('temp/store53.h5') as store:\n",
" query01 = store.select('table/df1_appended',\n",
" \"index>=Timestamp('2000-01-01 12:00:00.20+01:00') \\\n",
" & index) for more tutorials and updates.\n",
"\n",
"This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}