{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Parsing functions in pandas\n",
    "```\n",
    "read_csv              Load delimited data from a file, URL, or file-like object. Use comma as default delimiter\n",
    "read_table            Load delimited data from a file, URL, or file-like object. Use tab ('\\t') as default delimiter\n",
    "read_fwf              Read data in fixed-width column format (that is, no delimiters)\n",
    "read_clipboard      Version of read_table that reads data from the clipboard. Useful for converting tables from web pages\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LNAME,FNAME,JOB TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY,ESTIMATED ANNUAL SALARY MINUS FURLOUGHS\n",
      "AARON,ELVIA J,WATER RATE TAKER,WATER MGMNT,$81000.00,$73862.00\n",
      "AARON,JEFFERY M,POLICE OFFICER,POLICE,$74628.00,$74628.00\n",
      "AARON,KIMBERLEI R,CHIEF CONTRACT EXPEDITER,FLEET MANAGEMNT,$77280.00,$70174.00\n",
      "ABAD JR,VICENTE M,CIVIL ENGINEER IV,WATER MGMNT,$96276.00,$96276.00\n",
      "ABBATACOLA,ROBERT J,ELECTRICAL MECHANIC,WATER MGMNT,$84032.00,$76627.00\n",
      "ABBATE,TERRY M,POLICE OFFICER,POLICE,$79926.00,$79926.00\n",
      "ABBATEMARCO,JAMES J,FIREFIGHTER,FIRE,$77238.00,$77238.00\n",
      "ABBOTT,BETTY L,FOSTER GRANDPARENT,FAMILY & SUPPORT,$2756.00,$2756.00\n",
      "ABBOTT,LYNISE M,CLERK III,FAMILY & SUPPORT,$38568.00,$38568.00\n",
      "ABBOTT,SAM J,ELECTRICAL MECHANIC,TRANSPORTN,$84032.00,$76627.00\n",
      "ABDELHADI,ABDALMAHD,POLICE OFFICER,POLICE,$71040.00,$71040.00\n",
      "ABDELLATIF,AREF R,FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC,FIRE,$86922.00,$86922.00\n"
     ]
    }
   ],
   "source": [
    "# reading csv file, first check the content of it\n",
    "!more \"dataset/Employees2Head.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# this file is having header and delimiter is comma (,), we can read this with read_csv method\n",
    "fh = pd.read_csv(\"dataset/Employees2Head.csv\")\n",
    "len(fh)   # of lines in file excluding header"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          LNAME        FNAME                                      JOB TITLE  \\\n",
      "0         AARON      ELVIA J                               WATER RATE TAKER   \n",
      "1         AARON    JEFFERY M                                 POLICE OFFICER   \n",
      "2         AARON  KIMBERLEI R                       CHIEF CONTRACT EXPEDITER   \n",
      "3       ABAD JR    VICENTE M                              CIVIL ENGINEER IV   \n",
      "4    ABBATACOLA     ROBERT J                            ELECTRICAL MECHANIC   \n",
      "5        ABBATE      TERRY M                                 POLICE OFFICER   \n",
      "6   ABBATEMARCO      JAMES J                                    FIREFIGHTER   \n",
      "7        ABBOTT      BETTY L                             FOSTER GRANDPARENT   \n",
      "8        ABBOTT     LYNISE M                                      CLERK III   \n",
      "9        ABBOTT        SAM J                            ELECTRICAL MECHANIC   \n",
      "10    ABDELHADI    ABDALMAHD                                 POLICE OFFICER   \n",
      "11   ABDELLATIF       AREF R  FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC   \n",
      "\n",
      "          DEPARTMENT EMPLOYEE ANNUAL SALARY  \\\n",
      "0        WATER MGMNT              $81000.00   \n",
      "1             POLICE              $74628.00   \n",
      "2    FLEET MANAGEMNT              $77280.00   \n",
      "3        WATER MGMNT              $96276.00   \n",
      "4        WATER MGMNT              $84032.00   \n",
      "5             POLICE              $79926.00   \n",
      "6               FIRE              $77238.00   \n",
      "7   FAMILY & SUPPORT               $2756.00   \n",
      "8   FAMILY & SUPPORT              $38568.00   \n",
      "9         TRANSPORTN              $84032.00   \n",
      "10            POLICE              $71040.00   \n",
      "11              FIRE              $86922.00   \n",
      "\n",
      "   ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  \n",
      "0                                $73862.00  \n",
      "1                                $74628.00  \n",
      "2                                $70174.00  \n",
      "3                                $96276.00  \n",
      "4                                $76627.00  \n",
      "5                                $79926.00  \n",
      "6                                $77238.00  \n",
      "7                                 $2756.00  \n",
      "8                                $38568.00  \n",
      "9                                $76627.00  \n",
      "10                               $71040.00  \n",
      "11                               $86922.00  \n"
     ]
    }
   ],
   "source": [
    "print(fh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we can read the same file with read_table method with passing an option sep=\n",
    "fh = pd.read_table(\"dataset/Employees2Head.csv\", sep=\",\")\n",
    "len(fh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AARON,ELVIA J,WATER RATE TAKER,WATER MGMNT,$81000.00,$73862.00\n",
      "AARON,JEFFERY M,POLICE OFFICER,POLICE,$74628.00,$74628.00\n",
      "AARON,KIMBERLEI R,CHIEF CONTRACT EXPEDITER,FLEET MANAGEMNT,$77280.00,$70174.00\n",
      "ABAD JR,VICENTE M,CIVIL ENGINEER IV,WATER MGMNT,$96276.00,$96276.00\n",
      "ABBATACOLA,ROBERT J,ELECTRICAL MECHANIC,WATER MGMNT,$84032.00,$76627.00\n",
      "ABBATE,TERRY M,POLICE OFFICER,POLICE,$79926.00,$79926.00\n",
      "ABBATEMARCO,JAMES J,FIREFIGHTER,FIRE,$77238.00,$77238.00\n",
      "ABBOTT,BETTY L,FOSTER GRANDPARENT,FAMILY & SUPPORT,$2756.00,$2756.00\n",
      "ABBOTT,LYNISE M,CLERK III,FAMILY & SUPPORT,$38568.00,$38568.00\n",
      "ABBOTT,SAM J,ELECTRICAL MECHANIC,TRANSPORTN,$84032.00,$76627.00\n",
      "ABDELHADI,ABDALMAHD,POLICE OFFICER,POLICE,$71040.00,$71040.00\n",
      "ABDELLATIF,AREF R,FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC,FIRE,$86922.00,$86922.00\n"
     ]
    }
   ],
   "source": [
    "# let's check how to read a file without header\n",
    "!more \"dataset/Employees2.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fh = pd.read_csv(\"dataset/Employees2.csv\", header=None)\n",
    "len(fh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              0            1                                              2  \\\n",
      "0         AARON      ELVIA J                               WATER RATE TAKER   \n",
      "1         AARON    JEFFERY M                                 POLICE OFFICER   \n",
      "2         AARON  KIMBERLEI R                       CHIEF CONTRACT EXPEDITER   \n",
      "3       ABAD JR    VICENTE M                              CIVIL ENGINEER IV   \n",
      "4    ABBATACOLA     ROBERT J                            ELECTRICAL MECHANIC   \n",
      "5        ABBATE      TERRY M                                 POLICE OFFICER   \n",
      "6   ABBATEMARCO      JAMES J                                    FIREFIGHTER   \n",
      "7        ABBOTT      BETTY L                             FOSTER GRANDPARENT   \n",
      "8        ABBOTT     LYNISE M                                      CLERK III   \n",
      "9        ABBOTT        SAM J                            ELECTRICAL MECHANIC   \n",
      "10    ABDELHADI    ABDALMAHD                                 POLICE OFFICER   \n",
      "11   ABDELLATIF       AREF R  FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC   \n",
      "\n",
      "                   3          4          5  \n",
      "0        WATER MGMNT  $81000.00  $73862.00  \n",
      "1             POLICE  $74628.00  $74628.00  \n",
      "2    FLEET MANAGEMNT  $77280.00  $70174.00  \n",
      "3        WATER MGMNT  $96276.00  $96276.00  \n",
      "4        WATER MGMNT  $84032.00  $76627.00  \n",
      "5             POLICE  $79926.00  $79926.00  \n",
      "6               FIRE  $77238.00  $77238.00  \n",
      "7   FAMILY & SUPPORT   $2756.00   $2756.00  \n",
      "8   FAMILY & SUPPORT  $38568.00  $38568.00  \n",
      "9         TRANSPORTN  $84032.00  $76627.00  \n",
      "10            POLICE  $71040.00  $71040.00  \n",
      "11              FIRE  $86922.00  $86922.00  \n"
     ]
    }
   ],
   "source": [
    "print(fh) # check the column names, it've been assigned by pandas itself"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we can assign file dataframe the column names\n",
    "fh.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          LNAME        FNAME                                      JOB TITLE  \\\n",
      "0         AARON      ELVIA J                               WATER RATE TAKER   \n",
      "1         AARON    JEFFERY M                                 POLICE OFFICER   \n",
      "2         AARON  KIMBERLEI R                       CHIEF CONTRACT EXPEDITER   \n",
      "3       ABAD JR    VICENTE M                              CIVIL ENGINEER IV   \n",
      "4    ABBATACOLA     ROBERT J                            ELECTRICAL MECHANIC   \n",
      "5        ABBATE      TERRY M                                 POLICE OFFICER   \n",
      "6   ABBATEMARCO      JAMES J                                    FIREFIGHTER   \n",
      "7        ABBOTT      BETTY L                             FOSTER GRANDPARENT   \n",
      "8        ABBOTT     LYNISE M                                      CLERK III   \n",
      "9        ABBOTT        SAM J                            ELECTRICAL MECHANIC   \n",
      "10    ABDELHADI    ABDALMAHD                                 POLICE OFFICER   \n",
      "11   ABDELLATIF       AREF R  FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC   \n",
      "\n",
      "          DEPARTMENT EMPLOYEE ANNUAL SALARY  \\\n",
      "0        WATER MGMNT              $81000.00   \n",
      "1             POLICE              $74628.00   \n",
      "2    FLEET MANAGEMNT              $77280.00   \n",
      "3        WATER MGMNT              $96276.00   \n",
      "4        WATER MGMNT              $84032.00   \n",
      "5             POLICE              $79926.00   \n",
      "6               FIRE              $77238.00   \n",
      "7   FAMILY & SUPPORT               $2756.00   \n",
      "8   FAMILY & SUPPORT              $38568.00   \n",
      "9         TRANSPORTN              $84032.00   \n",
      "10            POLICE              $71040.00   \n",
      "11              FIRE              $86922.00   \n",
      "\n",
      "   ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  \n",
      "0                                $73862.00  \n",
      "1                                $74628.00  \n",
      "2                                $70174.00  \n",
      "3                                $96276.00  \n",
      "4                                $76627.00  \n",
      "5                                $79926.00  \n",
      "6                                $77238.00  \n",
      "7                                 $2756.00  \n",
      "8                                $38568.00  \n",
      "9                                $76627.00  \n",
      "10                               $71040.00  \n",
      "11                               $86922.00  \n"
     ]
    }
   ],
   "source": [
    "fh.columns = ['LNAME','FNAME','JOB TITLE','DEPARTMENT','EMPLOYEE ANNUAL SALARY','ESTIMATED ANNUAL SALARY MINUS FURLOUGHS'\n",
    "]\n",
    "print(fh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# or we can assign the names while reading the files\n",
    "fh = pd.read_table(\"dataset/Employees2.csv\", header=None, sep=',', \n",
    "                   names=['LNAME','FNAME','JOB TITLE','DEPARTMENT','EMPLOYEE ANNUAL SALARY',\n",
    "                          'ESTIMATED ANNUAL SALARY MINUS FURLOUGHS'])\n",
    "len(fh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          LNAME        FNAME                                      JOB TITLE  \\\n",
      "0         AARON      ELVIA J                               WATER RATE TAKER   \n",
      "1         AARON    JEFFERY M                                 POLICE OFFICER   \n",
      "2         AARON  KIMBERLEI R                       CHIEF CONTRACT EXPEDITER   \n",
      "3       ABAD JR    VICENTE M                              CIVIL ENGINEER IV   \n",
      "4    ABBATACOLA     ROBERT J                            ELECTRICAL MECHANIC   \n",
      "5        ABBATE      TERRY M                                 POLICE OFFICER   \n",
      "6   ABBATEMARCO      JAMES J                                    FIREFIGHTER   \n",
      "7        ABBOTT      BETTY L                             FOSTER GRANDPARENT   \n",
      "8        ABBOTT     LYNISE M                                      CLERK III   \n",
      "9        ABBOTT        SAM J                            ELECTRICAL MECHANIC   \n",
      "10    ABDELHADI    ABDALMAHD                                 POLICE OFFICER   \n",
      "11   ABDELLATIF       AREF R  FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC   \n",
      "\n",
      "          DEPARTMENT EMPLOYEE ANNUAL SALARY  \\\n",
      "0        WATER MGMNT              $81000.00   \n",
      "1             POLICE              $74628.00   \n",
      "2    FLEET MANAGEMNT              $77280.00   \n",
      "3        WATER MGMNT              $96276.00   \n",
      "4        WATER MGMNT              $84032.00   \n",
      "5             POLICE              $79926.00   \n",
      "6               FIRE              $77238.00   \n",
      "7   FAMILY & SUPPORT               $2756.00   \n",
      "8   FAMILY & SUPPORT              $38568.00   \n",
      "9         TRANSPORTN              $84032.00   \n",
      "10            POLICE              $71040.00   \n",
      "11              FIRE              $86922.00   \n",
      "\n",
      "   ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  \n",
      "0                                $73862.00  \n",
      "1                                $74628.00  \n",
      "2                                $70174.00  \n",
      "3                                $96276.00  \n",
      "4                                $76627.00  \n",
      "5                                $79926.00  \n",
      "6                                $77238.00  \n",
      "7                                 $2756.00  \n",
      "8                                $38568.00  \n",
      "9                                $76627.00  \n",
      "10                               $71040.00  \n",
      "11                               $86922.00  \n"
     ]
    }
   ],
   "source": [
    "print(fh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# In some cases, a table might not have a fixed delimiter, using whitespace or some other\n",
    "# pattern to separate fields. In these cases, you can pass a regular expression as a delimiter\n",
    "# for read_table.\n",
    "# pd.read_table('ch06/ex3.txt', sep='\\s+')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Skip the rows to read\n",
    "fh = pd.read_table(\"dataset/Employees2Head.csv\", sep=\",\", skiprows=[0,1,2])\n",
    "len(fh)  # total was 12 rows but skipped 3 rows"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Handling missing data while reading files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "LNAME,FNAME,JOB TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY,ESTIMATED ANNUAL SALARY MINUS FURLOUGHS\n",
      "AARON,ELVIA J,,WATER MGMNT,NULL,$73862.00\n",
      ",JEFFERY M,POLICE OFFICER,POLICE,$74628.00,$74628.00\n",
      "AARON,,CHIEF CONTRACT EXPEDITER,NaN,$77280.00,$70174.00\n",
      "ABAD JR,VICENTE M,CIVIL ENGINEER IV,WATER MGMNT,$96276.00,NaN\n",
      "|,ROBERT J,,NULL,$84032.00,$76627.00\n",
      "ABBATE,TERRY M,POLICE OFFICER,POLICE,$79926.00,NaN\n",
      "NULL,JAMES J,NA,FIRE,NULL,$77238.00\n",
      "NaN,,FOSTER GRANDPARENT,FAMILY & SUPPORT,NA,$2756.00\n",
      "NA,LYNISE M,CLERK III,FAMILY & SUPPORT,$38568.00,$38568.00\n",
      "|,SAM J,ELECTRICAL MECHANIC,TRANSPORTN,$84032.00,$76627.00\n",
      "ABDELHADI,ABDALMAHD,POLICE OFFICER,POLICE,$71040.00,$71040.00\n",
      ",AREF R,NA,FIRE,,$86922.00\n"
     ]
    }
   ],
   "source": [
    "!more \"dataset/Emp2HeadMiss.csv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        LNAME      FNAME                 JOB TITLE        DEPARTMENT  \\\n",
      "0       AARON    ELVIA J                       NaN       WATER MGMNT   \n",
      "1         NaN  JEFFERY M            POLICE OFFICER            POLICE   \n",
      "2       AARON        NaN  CHIEF CONTRACT EXPEDITER               NaN   \n",
      "3     ABAD JR  VICENTE M         CIVIL ENGINEER IV       WATER MGMNT   \n",
      "4           |   ROBERT J                       NaN               NaN   \n",
      "5      ABBATE    TERRY M            POLICE OFFICER            POLICE   \n",
      "6         NaN    JAMES J                       NaN              FIRE   \n",
      "7         NaN        NaN        FOSTER GRANDPARENT  FAMILY & SUPPORT   \n",
      "8         NaN   LYNISE M                 CLERK III  FAMILY & SUPPORT   \n",
      "9           |      SAM J       ELECTRICAL MECHANIC        TRANSPORTN   \n",
      "10  ABDELHADI  ABDALMAHD            POLICE OFFICER            POLICE   \n",
      "11        NaN     AREF R                       NaN              FIRE   \n",
      "\n",
      "   EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  \n",
      "0                     NaN                               $73862.00  \n",
      "1               $74628.00                               $74628.00  \n",
      "2               $77280.00                               $70174.00  \n",
      "3               $96276.00                                     NaN  \n",
      "4               $84032.00                               $76627.00  \n",
      "5               $79926.00                                     NaN  \n",
      "6                     NaN                               $77238.00  \n",
      "7                     NaN                                $2756.00  \n",
      "8               $38568.00                               $38568.00  \n",
      "9               $84032.00                               $76627.00  \n",
      "10              $71040.00                               $71040.00  \n",
      "11                    NaN                               $86922.00  \n"
     ]
    }
   ],
   "source": [
    "# reading the file\n",
    "fh = pd.read_csv(\"dataset/Emp2HeadMiss.csv\")\n",
    "print(fh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>LNAME</th>\n",
       "      <th>FNAME</th>\n",
       "      <th>JOB TITLE</th>\n",
       "      <th>DEPARTMENT</th>\n",
       "      <th>EMPLOYEE ANNUAL SALARY</th>\n",
       "      <th>ESTIMATED ANNUAL SALARY MINUS FURLOUGHS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    LNAME  FNAME JOB TITLE DEPARTMENT EMPLOYEE ANNUAL SALARY  \\\n",
       "0   False  False      True      False                   True   \n",
       "1    True  False     False      False                  False   \n",
       "2   False   True     False       True                  False   \n",
       "3   False  False     False      False                  False   \n",
       "4   False  False      True       True                  False   \n",
       "5   False  False     False      False                  False   \n",
       "6    True  False      True      False                   True   \n",
       "7    True   True     False      False                   True   \n",
       "8    True  False     False      False                  False   \n",
       "9   False  False     False      False                  False   \n",
       "10  False  False     False      False                  False   \n",
       "11   True  False      True      False                   True   \n",
       "\n",
       "   ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  \n",
       "0                                    False  \n",
       "1                                    False  \n",
       "2                                    False  \n",
       "3                                     True  \n",
       "4                                    False  \n",
       "5                                     True  \n",
       "6                                    False  \n",
       "7                                    False  \n",
       "8                                    False  \n",
       "9                                    False  \n",
       "10                                   False  \n",
       "11                                   False  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.isnull(fh)  # TRUE denotes the NULL value"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# The na_values option can take either a list or set of strings to consider missing values\n",
    "fh = pd.read_csv(\"dataset/Emp2HeadMiss.csv\", na_values=['NULL'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        LNAME      FNAME                 JOB TITLE        DEPARTMENT  \\\n",
      "0       AARON    ELVIA J                       NaN       WATER MGMNT   \n",
      "1         NaN  JEFFERY M            POLICE OFFICER            POLICE   \n",
      "2       AARON        NaN  CHIEF CONTRACT EXPEDITER               NaN   \n",
      "3     ABAD JR  VICENTE M         CIVIL ENGINEER IV       WATER MGMNT   \n",
      "4         NaN   ROBERT J                       NaN               NaN   \n",
      "5      ABBATE    TERRY M            POLICE OFFICER            POLICE   \n",
      "6         NaN    JAMES J                       NaN              FIRE   \n",
      "7         NaN        NaN        FOSTER GRANDPARENT  FAMILY & SUPPORT   \n",
      "8         NaN   LYNISE M                 CLERK III  FAMILY & SUPPORT   \n",
      "9         NaN      SAM J       ELECTRICAL MECHANIC        TRANSPORTN   \n",
      "10  ABDELHADI  ABDALMAHD            POLICE OFFICER            POLICE   \n",
      "11        NaN     AREF R                       NaN              FIRE   \n",
      "\n",
      "   EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  \n",
      "0                     NaN                               $73862.00  \n",
      "1               $74628.00                               $74628.00  \n",
      "2               $77280.00                               $70174.00  \n",
      "3               $96276.00                                     NaN  \n",
      "4               $84032.00                               $76627.00  \n",
      "5               $79926.00                                     NaN  \n",
      "6                     NaN                               $77238.00  \n",
      "7                     NaN                                $2756.00  \n",
      "8               $38568.00                               $38568.00  \n",
      "9               $84032.00                               $76627.00  \n",
      "10              $71040.00                               $71040.00  \n",
      "11                    NaN                               $86922.00  \n"
     ]
    }
   ],
   "source": [
    "# we can define NA values for each columns as well\n",
    "naValues = {'LNAME':['NA', 'NaN', 'NULL', '|'], 'FNAME':['NULL']}\n",
    "fh = pd.read_csv(\"dataset/Emp2HeadMiss.csv\", na_values=naValues)\n",
    "print(fh)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### read_csv /read_table function arguments\n",
    "```\n",
    "path                        String indicating filesystem location, URL, or file-like object\n",
    "sep or delimiter            Character sequence or regular expression to use to split fields in each row\n",
    "header                      Row number to use as column names. Defaults to 0 (first row), but should be None if there is no header row\n",
    "index_col                   Column numbers or names to use as the row index in the result. Can be a single name/number or a list of them for a hierarchical index\n",
    "names                       List of column names for result, combine with header=None\n",
    "skiprows                    Number of rows at beginning of file to ignore or list of row numbers (starting from 0) to skip\n",
    "na_values                   Sequence of values to replace with NA\n",
    "comment                     Character or characters to split comments off the end of lines\n",
    "parse_dates                 Attempt to parse data to datetime; False by default. If True, will attempt to parse all columns. Otherwise can specify a list of column numbers or name to parse. If element of list is tuple or list, will combine multiple columns together and parse to date (for example if date/time split across two columns)\n",
    "keep_date_col               If joining columns to parse date, drop the joined columns. Default True\n",
    "converters                  Dict containing column number of name mapping to functions. For example {'foo': f} would apply the function f to all values in the 'foo' column\n",
    "dayfirst                    When parsing potentially ambiguous dates, treat as international format (e.g. 7/6/2012 -> June 7, 2012). Default False\n",
    "date_parser                 Function to use to parse dates\n",
    "nrows                       Number of rows to read from beginning of file\n",
    "iterator                    Return a TextParser object for reading file piecemeal\n",
    "chunksize                   For iteration, size of file chunks\n",
    "skip_footer                 Number of lines to ignore at end of file\n",
    "verbose                     Print various parser output information, like the number of missing values placed in non-numeric columns\n",
    "encoding                    Text encoding for unicode. For example 'utf-8' for UTF-8 encoded text\n",
    "squeeze                     If the parsed data only contains one column return a Series\n",
    "thousands                   Separator for thousands, e.g. ',' or '.'\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading Text Files in Pieces"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "34218"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fh = pd.read_csv(\"dataset/Employees.csv\")\n",
    "len(fh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 34218 entries, 0 to 34217\n",
      "Data columns (total 6 columns):\n",
      "LNAME                                      34218 non-null object\n",
      "FNAME                                      34218 non-null object\n",
      "JOB TITLE                                  34218 non-null object\n",
      "DEPARTMENT                                 34218 non-null object\n",
      "EMPLOYEE ANNUAL SALARY                     34218 non-null object\n",
      "ESTIMATED ANNUAL SALARY MINUS FURLOUGHS    34218 non-null object\n",
      "dtypes: object(6)\n",
      "memory usage: 1.6+ MB\n"
     ]
    }
   ],
   "source": [
    "fh.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>LNAME</th>\n",
       "      <th>FNAME</th>\n",
       "      <th>JOB TITLE</th>\n",
       "      <th>DEPARTMENT</th>\n",
       "      <th>EMPLOYEE ANNUAL SALARY</th>\n",
       "      <th>ESTIMATED ANNUAL SALARY MINUS FURLOUGHS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>34218</td>\n",
       "      <td>34218</td>\n",
       "      <td>34218</td>\n",
       "      <td>34218</td>\n",
       "      <td>34218</td>\n",
       "      <td>34218</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>14431</td>\n",
       "      <td>12188</td>\n",
       "      <td>1124</td>\n",
       "      <td>39</td>\n",
       "      <td>1028</td>\n",
       "      <td>1088</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>WILLIAMS</td>\n",
       "      <td>MICHAEL J</td>\n",
       "      <td>POLICE OFFICER</td>\n",
       "      <td>POLICE</td>\n",
       "      <td>$77238.00</td>\n",
       "      <td>$77238.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>277</td>\n",
       "      <td>309</td>\n",
       "      <td>10918</td>\n",
       "      <td>14147</td>\n",
       "      <td>3028</td>\n",
       "      <td>3028</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           LNAME      FNAME       JOB TITLE DEPARTMENT EMPLOYEE ANNUAL SALARY  \\\n",
       "count      34218      34218           34218      34218                  34218   \n",
       "unique     14431      12188            1124         39                   1028   \n",
       "top     WILLIAMS  MICHAEL J  POLICE OFFICER     POLICE              $77238.00   \n",
       "freq         277        309           10918      14147                   3028   \n",
       "\n",
       "       ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  \n",
       "count                                    34218  \n",
       "unique                                    1088  \n",
       "top                                  $77238.00  \n",
       "freq                                      3028  "
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fh.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        LNAME        FNAME                 JOB TITLE       DEPARTMENT  \\\n",
      "0       AARON      ELVIA J          WATER RATE TAKER      WATER MGMNT   \n",
      "1       AARON    JEFFERY M            POLICE OFFICER           POLICE   \n",
      "2       AARON  KIMBERLEI R  CHIEF CONTRACT EXPEDITER  FLEET MANAGEMNT   \n",
      "3     ABAD JR    VICENTE M         CIVIL ENGINEER IV      WATER MGMNT   \n",
      "4  ABBATACOLA     ROBERT J       ELECTRICAL MECHANIC      WATER MGMNT   \n",
      "\n",
      "  EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  \n",
      "0              $81000.00                               $73862.00  \n",
      "1              $74628.00                               $74628.00  \n",
      "2              $77280.00                               $70174.00  \n",
      "3              $96276.00                               $96276.00  \n",
      "4              $84032.00                               $76627.00  \n"
     ]
    }
   ],
   "source": [
    "# if want to read few lines from file\n",
    "fh = pd.read_csv(\"dataset/Employees.csv\", nrows=5)\n",
    "print(fh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<pandas.io.parsers.TextFileReader at 0x178479579e8>"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# To read out a file in pieces, specify a chunksize as a number of rows\n",
    "chunker = pd.read_csv(\"dataset/Employees.csv\", chunksize=1000)\n",
    "chunker"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "35\n"
     ]
    }
   ],
   "source": [
    "line = 0\n",
    "for data in chunker:\n",
    "    line = line + 1\n",
    "print(line)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Writing Data Out to Text Format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "34218"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fh = pd.read_csv(\"dataset/Employees.csv\")\n",
    "len(fh)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# save the fh as csv\n",
    "fh.to_csv(\"dataset/Emp.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# save the fh as excel\n",
    "fh.to_excel(\"dataset/Emp.xls\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Other delimiters can be used\n",
    "# fh.to_csv(\"dataset/Emp.csv\", sep=\"|\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Missing values appear as empty strings in the output. You might want to denote them\n",
    "# by some other sentinel value:\n",
    "# sentinel = ['NULL']\n",
    "# data.to_csv(sys.stdout, na_rep=sentinel)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# With no other options specified, both the row and column labels are written. Both of\n",
    "# these can be disabled:\n",
    "fh.to_csv(\"dataset/Emp.csv\", header = False, index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# can write column subset as well\n",
    "cols = ['LNAME', 'FNAME']\n",
    "fh.to_csv(\"dataset/Emp.csv\", index = False, columns=cols)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Save Series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dates = pd.date_range('1/1/2000', periods=7)\n",
    "ts = pd.Series(np.arange(7), index=dates)\n",
    "ts.to_csv(\"dataset/times.psv\", sep=\"|\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2000-01-01|0\n",
      "2000-01-02|1\n",
      "2000-01-03|2\n",
      "2000-01-04|3\n",
      "2000-01-05|4\n",
      "2000-01-06|5\n",
      "2000-01-07|6\n"
     ]
    }
   ],
   "source": [
    "!more \"dataset/times.psv\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2000-01-01    0\n",
       "2000-01-02    1\n",
       "2000-01-03    2\n",
       "2000-01-04    3\n",
       "2000-01-05    4\n",
       "2000-01-06    5\n",
       "2000-01-07    6\n",
       "dtype: int64"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# With a bit of wrangling (no header, first column as index), you can read a CSV version\n",
    "# of a Series with read_csv, but there is also a from_csv convenience method that makes\n",
    "# it a bit simpler:\n",
    "pd.Series.from_csv('dataset/times.psv', parse_dates=True, sep=\"|\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Manually Working with Delimited Formats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "34219\n"
     ]
    }
   ],
   "source": [
    "# import csv\n",
    "import csv\n",
    "fh = open(\"dataset/Emp.csv\")\n",
    "reader = csv.reader(fh)\n",
    "\n",
    "count=0\n",
    "for line in reader:\n",
    "    count = count+1\n",
    "print(count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['LNAME', 'FNAME']"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lines = list(csv.reader(open(\"dataset/Emp.csv\")))\n",
    "header, values = lines[0], lines[1:]\n",
    "header"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['LNAME', 'FNAME'])"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_dict = {h: v for h, v in zip(header, zip(*values))}\n",
    "data_dict.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<_csv.reader object at 0x0000017847958D48>\n"
     ]
    }
   ],
   "source": [
    "# Defining a new format with a different delimiter, string quoting convention, \n",
    "# or line terminator is done by defining a simple subclass of csv.Dialect\n",
    "f = open(\"dataset/times.psv\")\n",
    "\n",
    "class my_dialect(csv.Dialect):\n",
    "    lineterminator = '\\n'\n",
    "    delimiter = '|'\n",
    "    quoting = csv.QUOTE_NONE\n",
    "    \n",
    "reader = csv.reader(f, dialect=my_dialect)\n",
    "print(reader)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['2000-01-01', '0']\n",
      "['2000-01-02', '1']\n",
      "['2000-01-03', '2']\n",
      "['2000-01-04', '3']\n",
      "['2000-01-05', '4']\n",
      "['2000-01-06', '5']\n",
      "['2000-01-07', '6']\n"
     ]
    }
   ],
   "source": [
    "# or we can define as\n",
    "reader = csv.reader(open(\"dataset/times.psv\"), delimiter = '|')\n",
    "for line in reader:\n",
    "    print(line)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### CSV dialect options\n",
    "\n",
    "```\n",
    "delimiter                   One-character string to separate fields. Defaults to ','.\n",
    "lineterminator              Line terminator for writing, defaults to '\\r\\n'. Reader ignores this and recognizes cross-platform line terminators.\n",
    "quotechar                   Quote character for fields with special characters (like a delimiter). Default is '\"'.\n",
    "quoting                     Quoting convention. Options include csv.QUOTE_ALL (quote all fields),\n",
    "                            csv.QUOTE_MINIMAL (only fields with special characters like the delimiter),\n",
    "                            csv.QUOTE_NONNUMERIC, and csv.QUOTE_NON (no quoting). See Python’s documentation for full details. Defaults to QUOTE_MINIMAL.\n",
    "skipinitialspace            Ignore whitespace after each delimiter. Default False.\n",
    "doublequote                 How to handle quoting character inside a field. If True, it is doubled. \n",
    "escapechar                  String to escape the delimiter if quoting is set to csv.QUOTE_NONE. Disabled by default\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# To write delimited files manually, use csv.writer.\n",
    "with open('dataset/mydata.csv', 'w') as f:\n",
    "    writer = csv.writer(f, dialect=my_dialect)\n",
    "    writer.writerow(('one', 'two', 'three'))\n",
    "    writer.writerow(('1', '2', '3'))\n",
    "    writer.writerow(('4', '5', '6'))\n",
    "    writer.writerow(('7', '8', '9'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### JSON Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n{\"name\": \"Wes\",\\n \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\\n \"pet\": null,\\n \"siblings\": [{\"name\": \"Scott\", \"age\": 25, \"pet\": \"Zuko\"},\\n              {\"name\": \"Katie\", \"age\": 33, \"pet\": \"Cisco\"}]\\n}\\n'"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "obj = \"\"\"\n",
    "{\"name\": \"Wes\",\n",
    " \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n",
    " \"pet\": null,\n",
    " \"siblings\": [{\"name\": \"Scott\", \"age\": 25, \"pet\": \"Zuko\"},\n",
    "              {\"name\": \"Katie\", \"age\": 33, \"pet\": \"Cisco\"}]\n",
    "}\n",
    "\"\"\"\n",
    "obj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'name': 'Wes',\n",
       " 'pet': None,\n",
       " 'places_lived': ['United States', 'Spain', 'Germany'],\n",
       " 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'},\n",
       "  {'age': 33, 'name': 'Katie', 'pet': 'Cisco'}]}"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import json\n",
    "data = json.loads(obj)\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'{\"name\": \"Wes\", \"siblings\": [{\"name\": \"Scott\", \"pet\": \"Zuko\", \"age\": 25}, {\"name\": \"Katie\", \"pet\": \"Cisco\", \"age\": 33}], \"places_lived\": [\"United States\", \"Spain\", \"Germany\"], \"pet\": null}'"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "asjson = json.dumps(data)\n",
    "asjson"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Scott</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Katie</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    name  age\n",
       "0  Scott   25\n",
       "1  Katie   33"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "siblings = pd.DataFrame(data['siblings'], columns=['name', 'age'])\n",
    "siblings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### XML and HTML: Web Scraping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# lxml pkg\n",
    "from lxml.html import parse\n",
    "# from urllib2 import urlopen  # for python 2\n",
    "from urllib.request import urlopen"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# this will download some data from net\n",
    "#parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))\n",
    "parsed = parse(urlopen('http://finance.yahoo.com'))\n",
    "#parsed = parse(urlopen('http://www.omdbapi.com/'))\n",
    "doc = parsed.getroot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<Element html at 0x17847b09958>\n"
     ]
    }
   ],
   "source": [
    "print(doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "49"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# to fetch all hyperlinks <a\n",
    "links = doc.findall('.//a')\n",
    "len(links)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://www.yahoo.com/tech'"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lnk = links[18]\n",
    "lnk.get('href')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Technology'"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lnk.text_content()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['https://www.yahoo.com/', 'https://mail.yahoo.com/?.intl=us&.lang=en-US', 'https://www.flickr.com/', 'https://www.tumblr.com/', 'https://www.yahoo.com/news/', 'http://sports.yahoo.com/', 'http://finance.yahoo.com/', 'https://www.yahoo.com/celebrity/', 'https://answers.yahoo.com/', 'https://groups.yahoo.com/']\n"
     ]
    }
   ],
   "source": [
    "# to get all the urls in one liner\n",
    "urls = [lnk.get('href') for lnk in doc.findall('.//a')]\n",
    "print(urls[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\n",
      "<Element table at 0x17847b21098>\n"
     ]
    }
   ],
   "source": [
    "## Finding the tables in html \n",
    "tables = doc.findall('.//table')\n",
    "print(len(tables))\n",
    "\n",
    "table = tables[0]\n",
    "print(table)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<Element tr at 0x17847a70a98>]\n"
     ]
    }
   ],
   "source": [
    "# table will have only 1 table\n",
    "# get td tag\n",
    "rows = table.findall('.//tr')\n",
    "print(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# For the header as well as the data rows, we want to extract the text from each cell; in\n",
    "# the case of the header these are th cells and td cells for the data:\n",
    "def _unpack(row, kind='td'):\n",
    "    elts = row.findall('.//%s' % kind)\n",
    "    return [val.text_content() for val in elts]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# printing table header th tag\n",
    "_unpack(rows[0], kind='th')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['', 'Search']"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# printing table data td tag\n",
    "_unpack(rows[0], kind='td')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# combining all the code into one func\n",
    "from pandas.io.parsers import TextParser\n",
    "def parse_options_data(table):\n",
    "    rows = table.findall('.//tr')\n",
    "    header = _unpack(rows[0], kind='th')\n",
    "    data = [_unpack(r) for r in rows[1:]]\n",
    "    return TextParser(data, names=header).get_chunk()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "EmptyDataError",
     "evalue": "No columns to parse from file",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mIndexError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_next_line\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1986\u001b[0m                 \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1987\u001b[0;31m                     \u001b[0mline\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_comments\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpos\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1988\u001b[0m                     \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpos\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mIndexError\u001b[0m: list index out of range",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mStopIteration\u001b[0m                             Traceback (most recent call last)",
      "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_infer_columns\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1906\u001b[0m             \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1907\u001b[0;31m                 \u001b[0mline\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_buffered_line\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1908\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_buffered_line\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1974\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1975\u001b[0;31m             \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_next_line\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1976\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_next_line\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1998\u001b[0m                 \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1999\u001b[0;31m                     \u001b[1;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2000\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mStopIteration\u001b[0m: ",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[0;31mEmptyDataError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-59-88fac389ba91>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparse_options_data\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtable\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-58-92759a96dbe8>\u001b[0m in \u001b[0;36mparse_options_data\u001b[0;34m(table)\u001b[0m\n\u001b[1;32m      5\u001b[0m     \u001b[0mheader\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_unpack\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrows\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkind\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'th'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m     \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0m_unpack\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mr\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mr\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrows\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m     \u001b[1;32mreturn\u001b[0m \u001b[0mTextParser\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mheader\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_chunk\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mTextParser\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m   1470\u001b[0m     \"\"\"\n\u001b[1;32m   1471\u001b[0m     \u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'engine'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'python'\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1472\u001b[0;31m     \u001b[1;32mreturn\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1473\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m   1474\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m    643\u001b[0m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'has_index_names'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'has_index_names'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m    644\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m--> 645\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    646\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m    647\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m    803\u001b[0m             \u001b[1;32melif\u001b[0m \u001b[0mengine\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'python-fwf'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m    804\u001b[0m                 \u001b[0mklass\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mFixedWidthFieldParser\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m--> 805\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mklass\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    806\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m    807\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m_failover_to_python\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, **kwds)\u001b[0m\n\u001b[1;32m   1606\u001b[0m         \u001b[1;31m# infer column indices from self.usecols if is is specified.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m   1607\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_col_indices\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1608\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnum_original_columns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_infer_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1609\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m   1610\u001b[0m         \u001b[1;31m# Now self.columns has the set of columns that we will process.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_infer_columns\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1910\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mnames\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m   1911\u001b[0m                     raise EmptyDataError(\n\u001b[0;32m-> 1912\u001b[0;31m                         \"No columns to parse from file\")\n\u001b[0m\u001b[1;32m   1913\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m   1914\u001b[0m                 \u001b[0mline\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnames\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mEmptyDataError\u001b[0m: No columns to parse from file"
     ]
    }
   ],
   "source": [
    "data = parse_options_data(table)\n",
    "print(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parsing XML with lxml.objectify"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from lxml import objectify\n",
    "parsed = objectify.parse(open(\"dataset/Performance_MNR.xml\"))\n",
    "root = parsed.getroot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "print(root)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data = []\n",
    "skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',\n",
    "               'DESIRED_CHANGE', 'DECIMAL_PLACES']\n",
    "for elt in root.INDICATOR:\n",
    "    el_data = {}\n",
    "    for child in elt.getchildren():\n",
    "        if child.tag in skip_fields:\n",
    "            continue\n",
    "        el_data[child.tag] = child.pyval\n",
    "    data.append(el_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "perf = pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>AGENCY_NAME</th>\n",
       "      <th>CATEGORY</th>\n",
       "      <th>DESCRIPTION</th>\n",
       "      <th>FREQUENCY</th>\n",
       "      <th>INDICATOR_NAME</th>\n",
       "      <th>INDICATOR_UNIT</th>\n",
       "      <th>MONTHLY_ACTUAL</th>\n",
       "      <th>MONTHLY_TARGET</th>\n",
       "      <th>PERIOD_MONTH</th>\n",
       "      <th>PERIOD_YEAR</th>\n",
       "      <th>YTD_ACTUAL</th>\n",
       "      <th>YTD_TARGET</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Metro-North Railroad</td>\n",
       "      <td>Service Indicators</td>\n",
       "      <td>Percent of commuter trains that arrive at thei...</td>\n",
       "      <td>M</td>\n",
       "      <td>On-Time Performance (West of Hudson)</td>\n",
       "      <td>%</td>\n",
       "      <td>96.9</td>\n",
       "      <td>95</td>\n",
       "      <td>1</td>\n",
       "      <td>2008</td>\n",
       "      <td>96.9</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Metro-North Railroad</td>\n",
       "      <td>Service Indicators</td>\n",
       "      <td>Percent of commuter trains that arrive at thei...</td>\n",
       "      <td>M</td>\n",
       "      <td>On-Time Performance (West of Hudson)</td>\n",
       "      <td>%</td>\n",
       "      <td>95</td>\n",
       "      <td>95</td>\n",
       "      <td>2</td>\n",
       "      <td>2008</td>\n",
       "      <td>96</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Metro-North Railroad</td>\n",
       "      <td>Service Indicators</td>\n",
       "      <td>Percent of commuter trains that arrive at thei...</td>\n",
       "      <td>M</td>\n",
       "      <td>On-Time Performance (West of Hudson)</td>\n",
       "      <td>%</td>\n",
       "      <td>96.9</td>\n",
       "      <td>95</td>\n",
       "      <td>3</td>\n",
       "      <td>2008</td>\n",
       "      <td>96.3</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Metro-North Railroad</td>\n",
       "      <td>Service Indicators</td>\n",
       "      <td>Percent of commuter trains that arrive at thei...</td>\n",
       "      <td>M</td>\n",
       "      <td>On-Time Performance (West of Hudson)</td>\n",
       "      <td>%</td>\n",
       "      <td>98.3</td>\n",
       "      <td>95</td>\n",
       "      <td>4</td>\n",
       "      <td>2008</td>\n",
       "      <td>96.8</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Metro-North Railroad</td>\n",
       "      <td>Service Indicators</td>\n",
       "      <td>Percent of commuter trains that arrive at thei...</td>\n",
       "      <td>M</td>\n",
       "      <td>On-Time Performance (West of Hudson)</td>\n",
       "      <td>%</td>\n",
       "      <td>95.8</td>\n",
       "      <td>95</td>\n",
       "      <td>5</td>\n",
       "      <td>2008</td>\n",
       "      <td>96.6</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            AGENCY_NAME            CATEGORY  \\\n",
       "0  Metro-North Railroad  Service Indicators   \n",
       "1  Metro-North Railroad  Service Indicators   \n",
       "2  Metro-North Railroad  Service Indicators   \n",
       "3  Metro-North Railroad  Service Indicators   \n",
       "4  Metro-North Railroad  Service Indicators   \n",
       "\n",
       "                                         DESCRIPTION FREQUENCY  \\\n",
       "0  Percent of commuter trains that arrive at thei...         M   \n",
       "1  Percent of commuter trains that arrive at thei...         M   \n",
       "2  Percent of commuter trains that arrive at thei...         M   \n",
       "3  Percent of commuter trains that arrive at thei...         M   \n",
       "4  Percent of commuter trains that arrive at thei...         M   \n",
       "\n",
       "                         INDICATOR_NAME INDICATOR_UNIT MONTHLY_ACTUAL  \\\n",
       "0  On-Time Performance (West of Hudson)              %           96.9   \n",
       "1  On-Time Performance (West of Hudson)              %             95   \n",
       "2  On-Time Performance (West of Hudson)              %           96.9   \n",
       "3  On-Time Performance (West of Hudson)              %           98.3   \n",
       "4  On-Time Performance (West of Hudson)              %           95.8   \n",
       "\n",
       "  MONTHLY_TARGET  PERIOD_MONTH  PERIOD_YEAR YTD_ACTUAL YTD_TARGET  \n",
       "0             95             1         2008       96.9         95  \n",
       "1             95             2         2008         96         95  \n",
       "2             95             3         2008       96.3         95  \n",
       "3             95             4         2008       96.8         95  \n",
       "4             95             5         2008       96.6         95  "
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "perf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}