{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# import\n", "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Parsing functions in pandas\n", "```\n", "read_csv Load delimited data from a file, URL, or file-like object. Use comma as default delimiter\n", "read_table Load delimited data from a file, URL, or file-like object. Use tab ('\\t') as default delimiter\n", "read_fwf Read data in fixed-width column format (that is, no delimiters)\n", "read_clipboard Version of read_table that reads data from the clipboard. Useful for converting tables from web pages\n", "```" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LNAME,FNAME,JOB TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY,ESTIMATED ANNUAL SALARY MINUS FURLOUGHS\n", "AARON,ELVIA J,WATER RATE TAKER,WATER MGMNT,$81000.00,$73862.00\n", "AARON,JEFFERY M,POLICE OFFICER,POLICE,$74628.00,$74628.00\n", "AARON,KIMBERLEI R,CHIEF CONTRACT EXPEDITER,FLEET MANAGEMNT,$77280.00,$70174.00\n", "ABAD JR,VICENTE M,CIVIL ENGINEER IV,WATER MGMNT,$96276.00,$96276.00\n", "ABBATACOLA,ROBERT J,ELECTRICAL MECHANIC,WATER MGMNT,$84032.00,$76627.00\n", "ABBATE,TERRY M,POLICE OFFICER,POLICE,$79926.00,$79926.00\n", "ABBATEMARCO,JAMES J,FIREFIGHTER,FIRE,$77238.00,$77238.00\n", "ABBOTT,BETTY L,FOSTER GRANDPARENT,FAMILY & SUPPORT,$2756.00,$2756.00\n", "ABBOTT,LYNISE M,CLERK III,FAMILY & SUPPORT,$38568.00,$38568.00\n", "ABBOTT,SAM J,ELECTRICAL MECHANIC,TRANSPORTN,$84032.00,$76627.00\n", "ABDELHADI,ABDALMAHD,POLICE OFFICER,POLICE,$71040.00,$71040.00\n", "ABDELLATIF,AREF R,FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC,FIRE,$86922.00,$86922.00\n" ] } ], "source": [ "# reading csv file, first check the content of it\n", "!more \"dataset/Employees2Head.csv\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "12" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# this file is having header and delimiter is comma (,), we can read this with read_csv method\n", "fh = pd.read_csv(\"dataset/Employees2Head.csv\")\n", "len(fh) # of lines in file excluding header" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " LNAME FNAME JOB TITLE \\\n", "0 AARON ELVIA J WATER RATE TAKER \n", "1 AARON JEFFERY M POLICE OFFICER \n", "2 AARON KIMBERLEI R CHIEF CONTRACT EXPEDITER \n", "3 ABAD JR VICENTE M CIVIL ENGINEER IV \n", "4 ABBATACOLA ROBERT J ELECTRICAL MECHANIC \n", "5 ABBATE TERRY M POLICE OFFICER \n", "6 ABBATEMARCO JAMES J FIREFIGHTER \n", "7 ABBOTT BETTY L FOSTER GRANDPARENT \n", "8 ABBOTT LYNISE M CLERK III \n", "9 ABBOTT SAM J ELECTRICAL MECHANIC \n", "10 ABDELHADI ABDALMAHD POLICE OFFICER \n", "11 ABDELLATIF AREF R FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC \n", "\n", " DEPARTMENT EMPLOYEE ANNUAL SALARY \\\n", "0 WATER MGMNT $81000.00 \n", "1 POLICE $74628.00 \n", "2 FLEET MANAGEMNT $77280.00 \n", "3 WATER MGMNT $96276.00 \n", "4 WATER MGMNT $84032.00 \n", "5 POLICE $79926.00 \n", "6 FIRE $77238.00 \n", "7 FAMILY & SUPPORT $2756.00 \n", "8 FAMILY & SUPPORT $38568.00 \n", "9 TRANSPORTN $84032.00 \n", "10 POLICE $71040.00 \n", "11 FIRE $86922.00 \n", "\n", " ESTIMATED ANNUAL SALARY MINUS FURLOUGHS \n", "0 $73862.00 \n", "1 $74628.00 \n", "2 $70174.00 \n", "3 $96276.00 \n", "4 $76627.00 \n", "5 $79926.00 \n", "6 $77238.00 \n", "7 $2756.00 \n", "8 $38568.00 \n", "9 $76627.00 \n", "10 $71040.00 \n", "11 $86922.00 \n" ] } ], "source": [ "print(fh)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "12" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we can read the same file with read_table method with passing an option sep=\n", "fh = pd.read_table(\"dataset/Employees2Head.csv\", sep=\",\")\n", "len(fh)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AARON,ELVIA J,WATER RATE TAKER,WATER MGMNT,$81000.00,$73862.00\n", "AARON,JEFFERY M,POLICE OFFICER,POLICE,$74628.00,$74628.00\n", "AARON,KIMBERLEI R,CHIEF CONTRACT EXPEDITER,FLEET MANAGEMNT,$77280.00,$70174.00\n", "ABAD JR,VICENTE M,CIVIL ENGINEER IV,WATER MGMNT,$96276.00,$96276.00\n", "ABBATACOLA,ROBERT J,ELECTRICAL MECHANIC,WATER MGMNT,$84032.00,$76627.00\n", "ABBATE,TERRY M,POLICE OFFICER,POLICE,$79926.00,$79926.00\n", "ABBATEMARCO,JAMES J,FIREFIGHTER,FIRE,$77238.00,$77238.00\n", "ABBOTT,BETTY L,FOSTER GRANDPARENT,FAMILY & SUPPORT,$2756.00,$2756.00\n", "ABBOTT,LYNISE M,CLERK III,FAMILY & SUPPORT,$38568.00,$38568.00\n", "ABBOTT,SAM J,ELECTRICAL MECHANIC,TRANSPORTN,$84032.00,$76627.00\n", "ABDELHADI,ABDALMAHD,POLICE OFFICER,POLICE,$71040.00,$71040.00\n", "ABDELLATIF,AREF R,FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC,FIRE,$86922.00,$86922.00\n" ] } ], "source": [ "# let's check how to read a file without header\n", "!more \"dataset/Employees2.csv\"" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "12" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fh = pd.read_csv(\"dataset/Employees2.csv\", header=None)\n", "len(fh)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0 1 2 \\\n", "0 AARON ELVIA J WATER RATE TAKER \n", "1 AARON JEFFERY M POLICE OFFICER \n", "2 AARON KIMBERLEI R CHIEF CONTRACT EXPEDITER \n", "3 ABAD JR VICENTE M CIVIL ENGINEER IV \n", "4 ABBATACOLA ROBERT J ELECTRICAL MECHANIC \n", "5 ABBATE TERRY M POLICE OFFICER \n", "6 ABBATEMARCO JAMES J FIREFIGHTER \n", "7 ABBOTT BETTY L FOSTER GRANDPARENT \n", "8 ABBOTT LYNISE M CLERK III \n", "9 ABBOTT SAM J ELECTRICAL MECHANIC \n", "10 ABDELHADI ABDALMAHD POLICE OFFICER \n", "11 ABDELLATIF AREF R FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC \n", "\n", " 3 4 5 \n", "0 WATER MGMNT $81000.00 $73862.00 \n", "1 POLICE $74628.00 $74628.00 \n", "2 FLEET MANAGEMNT $77280.00 $70174.00 \n", "3 WATER MGMNT $96276.00 $96276.00 \n", "4 WATER MGMNT $84032.00 $76627.00 \n", "5 POLICE $79926.00 $79926.00 \n", "6 FIRE $77238.00 $77238.00 \n", "7 FAMILY & SUPPORT $2756.00 $2756.00 \n", "8 FAMILY & SUPPORT $38568.00 $38568.00 \n", "9 TRANSPORTN $84032.00 $76627.00 \n", "10 POLICE $71040.00 $71040.00 \n", "11 FIRE $86922.00 $86922.00 \n" ] } ], "source": [ "print(fh) # check the column names, it've been assigned by pandas itself" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we can assign file dataframe the column names\n", "fh.columns" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " LNAME FNAME JOB TITLE \\\n", "0 AARON ELVIA J WATER RATE TAKER \n", "1 AARON JEFFERY M POLICE OFFICER \n", "2 AARON KIMBERLEI R CHIEF CONTRACT EXPEDITER \n", "3 ABAD JR VICENTE M CIVIL ENGINEER IV \n", "4 ABBATACOLA ROBERT J ELECTRICAL MECHANIC \n", "5 ABBATE TERRY M POLICE OFFICER \n", "6 ABBATEMARCO JAMES J FIREFIGHTER \n", "7 ABBOTT BETTY L FOSTER GRANDPARENT \n", "8 ABBOTT LYNISE M CLERK III \n", "9 ABBOTT SAM J ELECTRICAL MECHANIC \n", "10 ABDELHADI ABDALMAHD POLICE OFFICER \n", "11 ABDELLATIF AREF R FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC \n", "\n", " DEPARTMENT EMPLOYEE ANNUAL SALARY \\\n", "0 WATER MGMNT $81000.00 \n", "1 POLICE $74628.00 \n", "2 FLEET MANAGEMNT $77280.00 \n", "3 WATER MGMNT $96276.00 \n", "4 WATER MGMNT $84032.00 \n", "5 POLICE $79926.00 \n", "6 FIRE $77238.00 \n", "7 FAMILY & SUPPORT $2756.00 \n", "8 FAMILY & SUPPORT $38568.00 \n", "9 TRANSPORTN $84032.00 \n", "10 POLICE $71040.00 \n", "11 FIRE $86922.00 \n", "\n", " ESTIMATED ANNUAL SALARY MINUS FURLOUGHS \n", "0 $73862.00 \n", "1 $74628.00 \n", "2 $70174.00 \n", "3 $96276.00 \n", "4 $76627.00 \n", "5 $79926.00 \n", "6 $77238.00 \n", "7 $2756.00 \n", "8 $38568.00 \n", "9 $76627.00 \n", "10 $71040.00 \n", "11 $86922.00 \n" ] } ], "source": [ "fh.columns = ['LNAME','FNAME','JOB TITLE','DEPARTMENT','EMPLOYEE ANNUAL SALARY','ESTIMATED ANNUAL SALARY MINUS FURLOUGHS'\n", "]\n", "print(fh)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "12" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# or we can assign the names while reading the files\n", "fh = pd.read_table(\"dataset/Employees2.csv\", header=None, sep=',', \n", " names=['LNAME','FNAME','JOB TITLE','DEPARTMENT','EMPLOYEE ANNUAL SALARY',\n", " 'ESTIMATED ANNUAL SALARY MINUS FURLOUGHS'])\n", "len(fh)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " LNAME FNAME JOB TITLE \\\n", "0 AARON ELVIA J WATER RATE TAKER \n", "1 AARON JEFFERY M POLICE OFFICER \n", "2 AARON KIMBERLEI R CHIEF CONTRACT EXPEDITER \n", "3 ABAD JR VICENTE M CIVIL ENGINEER IV \n", "4 ABBATACOLA ROBERT J ELECTRICAL MECHANIC \n", "5 ABBATE TERRY M POLICE OFFICER \n", "6 ABBATEMARCO JAMES J FIREFIGHTER \n", "7 ABBOTT BETTY L FOSTER GRANDPARENT \n", "8 ABBOTT LYNISE M CLERK III \n", "9 ABBOTT SAM J ELECTRICAL MECHANIC \n", "10 ABDELHADI ABDALMAHD POLICE OFFICER \n", "11 ABDELLATIF AREF R FIREFIGHTER (PER ARBITRATORS AWARD)-PARAMEDIC \n", "\n", " DEPARTMENT EMPLOYEE ANNUAL SALARY \\\n", "0 WATER MGMNT $81000.00 \n", "1 POLICE $74628.00 \n", "2 FLEET MANAGEMNT $77280.00 \n", "3 WATER MGMNT $96276.00 \n", "4 WATER MGMNT $84032.00 \n", "5 POLICE $79926.00 \n", "6 FIRE $77238.00 \n", "7 FAMILY & SUPPORT $2756.00 \n", "8 FAMILY & SUPPORT $38568.00 \n", "9 TRANSPORTN $84032.00 \n", "10 POLICE $71040.00 \n", "11 FIRE $86922.00 \n", "\n", " ESTIMATED ANNUAL SALARY MINUS FURLOUGHS \n", "0 $73862.00 \n", "1 $74628.00 \n", "2 $70174.00 \n", "3 $96276.00 \n", "4 $76627.00 \n", "5 $79926.00 \n", "6 $77238.00 \n", "7 $2756.00 \n", "8 $38568.00 \n", "9 $76627.00 \n", "10 $71040.00 \n", "11 $86922.00 \n" ] } ], "source": [ "print(fh)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# In some cases, a table might not have a fixed delimiter, using whitespace or some other\n", "# pattern to separate fields. In these cases, you can pass a regular expression as a delimiter\n", "# for read_table.\n", "# pd.read_table('ch06/ex3.txt', sep='\\s+')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "9" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Skip the rows to read\n", "fh = pd.read_table(\"dataset/Employees2Head.csv\", sep=\",\", skiprows=[0,1,2])\n", "len(fh) # total was 12 rows but skipped 3 rows" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Handling missing data while reading files" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LNAME,FNAME,JOB TITLE,DEPARTMENT,EMPLOYEE ANNUAL SALARY,ESTIMATED ANNUAL SALARY MINUS FURLOUGHS\n", "AARON,ELVIA J,,WATER MGMNT,NULL,$73862.00\n", ",JEFFERY M,POLICE OFFICER,POLICE,$74628.00,$74628.00\n", "AARON,,CHIEF CONTRACT EXPEDITER,NaN,$77280.00,$70174.00\n", "ABAD JR,VICENTE M,CIVIL ENGINEER IV,WATER MGMNT,$96276.00,NaN\n", "|,ROBERT J,,NULL,$84032.00,$76627.00\n", "ABBATE,TERRY M,POLICE OFFICER,POLICE,$79926.00,NaN\n", "NULL,JAMES J,NA,FIRE,NULL,$77238.00\n", "NaN,,FOSTER GRANDPARENT,FAMILY & SUPPORT,NA,$2756.00\n", "NA,LYNISE M,CLERK III,FAMILY & SUPPORT,$38568.00,$38568.00\n", "|,SAM J,ELECTRICAL MECHANIC,TRANSPORTN,$84032.00,$76627.00\n", "ABDELHADI,ABDALMAHD,POLICE OFFICER,POLICE,$71040.00,$71040.00\n", ",AREF R,NA,FIRE,,$86922.00\n" ] } ], "source": [ "!more \"dataset/Emp2HeadMiss.csv\"" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " LNAME FNAME JOB TITLE DEPARTMENT \\\n", "0 AARON ELVIA J NaN WATER MGMNT \n", "1 NaN JEFFERY M POLICE OFFICER POLICE \n", "2 AARON NaN CHIEF CONTRACT EXPEDITER NaN \n", "3 ABAD JR VICENTE M CIVIL ENGINEER IV WATER MGMNT \n", "4 | ROBERT J NaN NaN \n", "5 ABBATE TERRY M POLICE OFFICER POLICE \n", "6 NaN JAMES J NaN FIRE \n", "7 NaN NaN FOSTER GRANDPARENT FAMILY & SUPPORT \n", "8 NaN LYNISE M CLERK III FAMILY & SUPPORT \n", "9 | SAM J ELECTRICAL MECHANIC TRANSPORTN \n", "10 ABDELHADI ABDALMAHD POLICE OFFICER POLICE \n", "11 NaN AREF R NaN FIRE \n", "\n", " EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS \n", "0 NaN $73862.00 \n", "1 $74628.00 $74628.00 \n", "2 $77280.00 $70174.00 \n", "3 $96276.00 NaN \n", "4 $84032.00 $76627.00 \n", "5 $79926.00 NaN \n", "6 NaN $77238.00 \n", "7 NaN $2756.00 \n", "8 $38568.00 $38568.00 \n", "9 $84032.00 $76627.00 \n", "10 $71040.00 $71040.00 \n", "11 NaN $86922.00 \n" ] } ], "source": [ "# reading the file\n", "fh = pd.read_csv(\"dataset/Emp2HeadMiss.csv\")\n", "print(fh)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LNAMEFNAMEJOB TITLEDEPARTMENTEMPLOYEE ANNUAL SALARYESTIMATED ANNUAL SALARY MINUS FURLOUGHS
0FalseFalseTrueFalseTrueFalse
1TrueFalseFalseFalseFalseFalse
2FalseTrueFalseTrueFalseFalse
3FalseFalseFalseFalseFalseTrue
4FalseFalseTrueTrueFalseFalse
5FalseFalseFalseFalseFalseTrue
6TrueFalseTrueFalseTrueFalse
7TrueTrueFalseFalseTrueFalse
8TrueFalseFalseFalseFalseFalse
9FalseFalseFalseFalseFalseFalse
10FalseFalseFalseFalseFalseFalse
11TrueFalseTrueFalseTrueFalse
\n", "
" ], "text/plain": [ " LNAME FNAME JOB TITLE DEPARTMENT EMPLOYEE ANNUAL SALARY \\\n", "0 False False True False True \n", "1 True False False False False \n", "2 False True False True False \n", "3 False False False False False \n", "4 False False True True False \n", "5 False False False False False \n", "6 True False True False True \n", "7 True True False False True \n", "8 True False False False False \n", "9 False False False False False \n", "10 False False False False False \n", "11 True False True False True \n", "\n", " ESTIMATED ANNUAL SALARY MINUS FURLOUGHS \n", "0 False \n", "1 False \n", "2 False \n", "3 True \n", "4 False \n", "5 True \n", "6 False \n", "7 False \n", "8 False \n", "9 False \n", "10 False \n", "11 False " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.isnull(fh) # TRUE denotes the NULL value" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# The na_values option can take either a list or set of strings to consider missing values\n", "fh = pd.read_csv(\"dataset/Emp2HeadMiss.csv\", na_values=['NULL'])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " LNAME FNAME JOB TITLE DEPARTMENT \\\n", "0 AARON ELVIA J NaN WATER MGMNT \n", "1 NaN JEFFERY M POLICE OFFICER POLICE \n", "2 AARON NaN CHIEF CONTRACT EXPEDITER NaN \n", "3 ABAD JR VICENTE M CIVIL ENGINEER IV WATER MGMNT \n", "4 NaN ROBERT J NaN NaN \n", "5 ABBATE TERRY M POLICE OFFICER POLICE \n", "6 NaN JAMES J NaN FIRE \n", "7 NaN NaN FOSTER GRANDPARENT FAMILY & SUPPORT \n", "8 NaN LYNISE M CLERK III FAMILY & SUPPORT \n", "9 NaN SAM J ELECTRICAL MECHANIC TRANSPORTN \n", "10 ABDELHADI ABDALMAHD POLICE OFFICER POLICE \n", "11 NaN AREF R NaN FIRE \n", "\n", " EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS \n", "0 NaN $73862.00 \n", "1 $74628.00 $74628.00 \n", "2 $77280.00 $70174.00 \n", "3 $96276.00 NaN \n", "4 $84032.00 $76627.00 \n", "5 $79926.00 NaN \n", "6 NaN $77238.00 \n", "7 NaN $2756.00 \n", "8 $38568.00 $38568.00 \n", "9 $84032.00 $76627.00 \n", "10 $71040.00 $71040.00 \n", "11 NaN $86922.00 \n" ] } ], "source": [ "# we can define NA values for each columns as well\n", "naValues = {'LNAME':['NA', 'NaN', 'NULL', '|'], 'FNAME':['NULL']}\n", "fh = pd.read_csv(\"dataset/Emp2HeadMiss.csv\", na_values=naValues)\n", "print(fh)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### read_csv /read_table function arguments\n", "```\n", "path String indicating filesystem location, URL, or file-like object\n", "sep or delimiter Character sequence or regular expression to use to split fields in each row\n", "header Row number to use as column names. Defaults to 0 (first row), but should be None if there is no header row\n", "index_col Column numbers or names to use as the row index in the result. Can be a single name/number or a list of them for a hierarchical index\n", "names List of column names for result, combine with header=None\n", "skiprows Number of rows at beginning of file to ignore or list of row numbers (starting from 0) to skip\n", "na_values Sequence of values to replace with NA\n", "comment Character or characters to split comments off the end of lines\n", "parse_dates Attempt to parse data to datetime; False by default. If True, will attempt to parse all columns. Otherwise can specify a list of column numbers or name to parse. If element of list is tuple or list, will combine multiple columns together and parse to date (for example if date/time split across two columns)\n", "keep_date_col If joining columns to parse date, drop the joined columns. Default True\n", "converters Dict containing column number of name mapping to functions. For example {'foo': f} would apply the function f to all values in the 'foo' column\n", "dayfirst When parsing potentially ambiguous dates, treat as international format (e.g. 7/6/2012 -> June 7, 2012). Default False\n", "date_parser Function to use to parse dates\n", "nrows Number of rows to read from beginning of file\n", "iterator Return a TextParser object for reading file piecemeal\n", "chunksize For iteration, size of file chunks\n", "skip_footer Number of lines to ignore at end of file\n", "verbose Print various parser output information, like the number of missing values placed in non-numeric columns\n", "encoding Text encoding for unicode. For example 'utf-8' for UTF-8 encoded text\n", "squeeze If the parsed data only contains one column return a Series\n", "thousands Separator for thousands, e.g. ',' or '.'\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading Text Files in Pieces" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "34218" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fh = pd.read_csv(\"dataset/Employees.csv\")\n", "len(fh)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 34218 entries, 0 to 34217\n", "Data columns (total 6 columns):\n", "LNAME 34218 non-null object\n", "FNAME 34218 non-null object\n", "JOB TITLE 34218 non-null object\n", "DEPARTMENT 34218 non-null object\n", "EMPLOYEE ANNUAL SALARY 34218 non-null object\n", "ESTIMATED ANNUAL SALARY MINUS FURLOUGHS 34218 non-null object\n", "dtypes: object(6)\n", "memory usage: 1.6+ MB\n" ] } ], "source": [ "fh.info()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LNAMEFNAMEJOB TITLEDEPARTMENTEMPLOYEE ANNUAL SALARYESTIMATED ANNUAL SALARY MINUS FURLOUGHS
count342183421834218342183421834218
unique144311218811243910281088
topWILLIAMSMICHAEL JPOLICE OFFICERPOLICE$77238.00$77238.00
freq277309109181414730283028
\n", "
" ], "text/plain": [ " LNAME FNAME JOB TITLE DEPARTMENT EMPLOYEE ANNUAL SALARY \\\n", "count 34218 34218 34218 34218 34218 \n", "unique 14431 12188 1124 39 1028 \n", "top WILLIAMS MICHAEL J POLICE OFFICER POLICE $77238.00 \n", "freq 277 309 10918 14147 3028 \n", "\n", " ESTIMATED ANNUAL SALARY MINUS FURLOUGHS \n", "count 34218 \n", "unique 1088 \n", "top $77238.00 \n", "freq 3028 " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fh.describe()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " LNAME FNAME JOB TITLE DEPARTMENT \\\n", "0 AARON ELVIA J WATER RATE TAKER WATER MGMNT \n", "1 AARON JEFFERY M POLICE OFFICER POLICE \n", "2 AARON KIMBERLEI R CHIEF CONTRACT EXPEDITER FLEET MANAGEMNT \n", "3 ABAD JR VICENTE M CIVIL ENGINEER IV WATER MGMNT \n", "4 ABBATACOLA ROBERT J ELECTRICAL MECHANIC WATER MGMNT \n", "\n", " EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS \n", "0 $81000.00 $73862.00 \n", "1 $74628.00 $74628.00 \n", "2 $77280.00 $70174.00 \n", "3 $96276.00 $96276.00 \n", "4 $84032.00 $76627.00 \n" ] } ], "source": [ "# if want to read few lines from file\n", "fh = pd.read_csv(\"dataset/Employees.csv\", nrows=5)\n", "print(fh)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# To read out a file in pieces, specify a chunksize as a number of rows\n", "chunker = pd.read_csv(\"dataset/Employees.csv\", chunksize=1000)\n", "chunker" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "35\n" ] } ], "source": [ "line = 0\n", "for data in chunker:\n", " line = line + 1\n", "print(line)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Writing Data Out to Text Format" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "34218" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fh = pd.read_csv(\"dataset/Employees.csv\")\n", "len(fh)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# save the fh as csv\n", "fh.to_csv(\"dataset/Emp.csv\")" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# save the fh as excel\n", "fh.to_excel(\"dataset/Emp.xls\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Other delimiters can be used\n", "# fh.to_csv(\"dataset/Emp.csv\", sep=\"|\")" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Missing values appear as empty strings in the output. You might want to denote them\n", "# by some other sentinel value:\n", "# sentinel = ['NULL']\n", "# data.to_csv(sys.stdout, na_rep=sentinel)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# With no other options specified, both the row and column labels are written. Both of\n", "# these can be disabled:\n", "fh.to_csv(\"dataset/Emp.csv\", header = False, index = False)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# can write column subset as well\n", "cols = ['LNAME', 'FNAME']\n", "fh.to_csv(\"dataset/Emp.csv\", index = False, columns=cols)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Save Series" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dates = pd.date_range('1/1/2000', periods=7)\n", "ts = pd.Series(np.arange(7), index=dates)\n", "ts.to_csv(\"dataset/times.psv\", sep=\"|\")" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2000-01-01|0\n", "2000-01-02|1\n", "2000-01-03|2\n", "2000-01-04|3\n", "2000-01-05|4\n", "2000-01-06|5\n", "2000-01-07|6\n" ] } ], "source": [ "!more \"dataset/times.psv\"" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "2000-01-01 0\n", "2000-01-02 1\n", "2000-01-03 2\n", "2000-01-04 3\n", "2000-01-05 4\n", "2000-01-06 5\n", "2000-01-07 6\n", "dtype: int64" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# With a bit of wrangling (no header, first column as index), you can read a CSV version\n", "# of a Series with read_csv, but there is also a from_csv convenience method that makes\n", "# it a bit simpler:\n", "pd.Series.from_csv('dataset/times.psv', parse_dates=True, sep=\"|\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Manually Working with Delimited Formats" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "34219\n" ] } ], "source": [ "# import csv\n", "import csv\n", "fh = open(\"dataset/Emp.csv\")\n", "reader = csv.reader(fh)\n", "\n", "count=0\n", "for line in reader:\n", " count = count+1\n", "print(count)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['LNAME', 'FNAME']" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lines = list(csv.reader(open(\"dataset/Emp.csv\")))\n", "header, values = lines[0], lines[1:]\n", "header" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "dict_keys(['LNAME', 'FNAME'])" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_dict = {h: v for h, v in zip(header, zip(*values))}\n", "data_dict.keys()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<_csv.reader object at 0x0000017847958D48>\n" ] } ], "source": [ "# Defining a new format with a different delimiter, string quoting convention, \n", "# or line terminator is done by defining a simple subclass of csv.Dialect\n", "f = open(\"dataset/times.psv\")\n", "\n", "class my_dialect(csv.Dialect):\n", " lineterminator = '\\n'\n", " delimiter = '|'\n", " quoting = csv.QUOTE_NONE\n", " \n", "reader = csv.reader(f, dialect=my_dialect)\n", "print(reader)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['2000-01-01', '0']\n", "['2000-01-02', '1']\n", "['2000-01-03', '2']\n", "['2000-01-04', '3']\n", "['2000-01-05', '4']\n", "['2000-01-06', '5']\n", "['2000-01-07', '6']\n" ] } ], "source": [ "# or we can define as\n", "reader = csv.reader(open(\"dataset/times.psv\"), delimiter = '|')\n", "for line in reader:\n", " print(line)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### CSV dialect options\n", "\n", "```\n", "delimiter One-character string to separate fields. Defaults to ','.\n", "lineterminator Line terminator for writing, defaults to '\\r\\n'. Reader ignores this and recognizes cross-platform line terminators.\n", "quotechar Quote character for fields with special characters (like a delimiter). Default is '\"'.\n", "quoting Quoting convention. Options include csv.QUOTE_ALL (quote all fields),\n", " csv.QUOTE_MINIMAL (only fields with special characters like the delimiter),\n", " csv.QUOTE_NONNUMERIC, and csv.QUOTE_NON (no quoting). See Python’s documentation for full details. Defaults to QUOTE_MINIMAL.\n", "skipinitialspace Ignore whitespace after each delimiter. Default False.\n", "doublequote How to handle quoting character inside a field. If True, it is doubled. \n", "escapechar String to escape the delimiter if quoting is set to csv.QUOTE_NONE. Disabled by default\n", "```" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# To write delimited files manually, use csv.writer.\n", "with open('dataset/mydata.csv', 'w') as f:\n", " writer = csv.writer(f, dialect=my_dialect)\n", " writer.writerow(('one', 'two', 'three'))\n", " writer.writerow(('1', '2', '3'))\n", " writer.writerow(('4', '5', '6'))\n", " writer.writerow(('7', '8', '9'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### JSON Data" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'\\n{\"name\": \"Wes\",\\n \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\\n \"pet\": null,\\n \"siblings\": [{\"name\": \"Scott\", \"age\": 25, \"pet\": \"Zuko\"},\\n {\"name\": \"Katie\", \"age\": 33, \"pet\": \"Cisco\"}]\\n}\\n'" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj = \"\"\"\n", "{\"name\": \"Wes\",\n", " \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n", " \"pet\": null,\n", " \"siblings\": [{\"name\": \"Scott\", \"age\": 25, \"pet\": \"Zuko\"},\n", " {\"name\": \"Katie\", \"age\": 33, \"pet\": \"Cisco\"}]\n", "}\n", "\"\"\"\n", "obj" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'name': 'Wes',\n", " 'pet': None,\n", " 'places_lived': ['United States', 'Spain', 'Germany'],\n", " 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'},\n", " {'age': 33, 'name': 'Katie', 'pet': 'Cisco'}]}" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import json\n", "data = json.loads(obj)\n", "data" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'{\"name\": \"Wes\", \"siblings\": [{\"name\": \"Scott\", \"pet\": \"Zuko\", \"age\": 25}, {\"name\": \"Katie\", \"pet\": \"Cisco\", \"age\": 33}], \"places_lived\": [\"United States\", \"Spain\", \"Germany\"], \"pet\": null}'" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "asjson = json.dumps(data)\n", "asjson" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameage
0Scott25
1Katie33
\n", "
" ], "text/plain": [ " name age\n", "0 Scott 25\n", "1 Katie 33" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "siblings = pd.DataFrame(data['siblings'], columns=['name', 'age'])\n", "siblings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### XML and HTML: Web Scraping" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# lxml pkg\n", "from lxml.html import parse\n", "# from urllib2 import urlopen # for python 2\n", "from urllib.request import urlopen" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# this will download some data from net\n", "#parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))\n", "parsed = parse(urlopen('http://finance.yahoo.com'))\n", "#parsed = parse(urlopen('http://www.omdbapi.com/'))\n", "doc = parsed.getroot()" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "print(doc)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "49" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# to fetch all hyperlinks \n" ] } ], "source": [ "## Finding the tables in html \n", "tables = doc.findall('.//table')\n", "print(len(tables))\n", "\n", "table = tables[0]\n", "print(table)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[]\n" ] } ], "source": [ "# table will have only 1 table\n", "# get td tag\n", "rows = table.findall('.//tr')\n", "print(rows)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# For the header as well as the data rows, we want to extract the text from each cell; in\n", "# the case of the header these are th cells and td cells for the data:\n", "def _unpack(row, kind='td'):\n", " elts = row.findall('.//%s' % kind)\n", " return [val.text_content() for val in elts]" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# printing table header th tag\n", "_unpack(rows[0], kind='th')" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['', 'Search']" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# printing table data td tag\n", "_unpack(rows[0], kind='td')" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# combining all the code into one func\n", "from pandas.io.parsers import TextParser\n", "def parse_options_data(table):\n", " rows = table.findall('.//tr')\n", " header = _unpack(rows[0], kind='th')\n", " data = [_unpack(r) for r in rows[1:]]\n", " return TextParser(data, names=header).get_chunk()" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "ename": "EmptyDataError", "evalue": "No columns to parse from file", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_next_line\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1986\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1987\u001b[0;31m \u001b[0mline\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_comments\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpos\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1988\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpos\u001b[0m \u001b[1;33m+=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[0;31mIndexError\u001b[0m: list index out of range", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[0;31mStopIteration\u001b[0m Traceback (most recent call last)", "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_infer_columns\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1906\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1907\u001b[0;31m \u001b[0mline\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_buffered_line\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1908\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_buffered_line\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1974\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1975\u001b[0;31m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_next_line\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1976\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_next_line\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1998\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mIndexError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1999\u001b[0;31m \u001b[1;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2000\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[0;31mStopIteration\u001b[0m: ", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[0;31mEmptyDataError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparse_options_data\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtable\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mparse_options_data\u001b[0;34m(table)\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mheader\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_unpack\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrows\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkind\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'th'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0m_unpack\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mr\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mr\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrows\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[1;32mreturn\u001b[0m \u001b[0mTextParser\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mheader\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_chunk\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mTextParser\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 1470\u001b[0m \"\"\"\n\u001b[1;32m 1471\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'engine'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'python'\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1472\u001b[0;31m \u001b[1;32mreturn\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1473\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 1474\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 643\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'has_index_names'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'has_index_names'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 644\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m--> 645\u001b[0;31m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 646\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[0;34m(self, engine)\u001b[0m\n\u001b[1;32m 803\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mengine\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'python-fwf'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 804\u001b[0m \u001b[0mklass\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mFixedWidthFieldParser\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m--> 805\u001b[0;31m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mklass\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 806\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 807\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_failover_to_python\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, f, **kwds)\u001b[0m\n\u001b[1;32m 1606\u001b[0m \u001b[1;31m# infer column indices from self.usecols if is is specified.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 1607\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_col_indices\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1608\u001b[0;31m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnum_original_columns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_infer_columns\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1609\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 1610\u001b[0m \u001b[1;31m# Now self.columns has the set of columns that we will process.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[0;32mC:\\tools\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_infer_columns\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1910\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mnames\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 1911\u001b[0m raise EmptyDataError(\n\u001b[0;32m-> 1912\u001b[0;31m \"No columns to parse from file\")\n\u001b[0m\u001b[1;32m 1913\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m 1914\u001b[0m \u001b[0mline\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnames\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", "\u001b[0;31mEmptyDataError\u001b[0m: No columns to parse from file" ] } ], "source": [ "data = parse_options_data(table)\n", "print(data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Parsing XML with lxml.objectify" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from lxml import objectify\n", "parsed = objectify.parse(open(\"dataset/Performance_MNR.xml\"))\n", "root = parsed.getroot()" ] }, { "cell_type": "code", "execution_count": 61, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "print(root)" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data = []\n", "skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',\n", " 'DESIRED_CHANGE', 'DECIMAL_PLACES']\n", "for elt in root.INDICATOR:\n", " el_data = {}\n", " for child in elt.getchildren():\n", " if child.tag in skip_fields:\n", " continue\n", " el_data[child.tag] = child.pyval\n", " data.append(el_data)" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": true }, "outputs": [], "source": [ "perf = pd.DataFrame(data)" ] }, { "cell_type": "code", "execution_count": 64, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AGENCY_NAMECATEGORYDESCRIPTIONFREQUENCYINDICATOR_NAMEINDICATOR_UNITMONTHLY_ACTUALMONTHLY_TARGETPERIOD_MONTHPERIOD_YEARYTD_ACTUALYTD_TARGET
0Metro-North RailroadService IndicatorsPercent of commuter trains that arrive at thei...MOn-Time Performance (West of Hudson)%96.9951200896.995
1Metro-North RailroadService IndicatorsPercent of commuter trains that arrive at thei...MOn-Time Performance (West of Hudson)%9595220089695
2Metro-North RailroadService IndicatorsPercent of commuter trains that arrive at thei...MOn-Time Performance (West of Hudson)%96.9953200896.395
3Metro-North RailroadService IndicatorsPercent of commuter trains that arrive at thei...MOn-Time Performance (West of Hudson)%98.3954200896.895
4Metro-North RailroadService IndicatorsPercent of commuter trains that arrive at thei...MOn-Time Performance (West of Hudson)%95.8955200896.695
\n", "
" ], "text/plain": [ " AGENCY_NAME CATEGORY \\\n", "0 Metro-North Railroad Service Indicators \n", "1 Metro-North Railroad Service Indicators \n", "2 Metro-North Railroad Service Indicators \n", "3 Metro-North Railroad Service Indicators \n", "4 Metro-North Railroad Service Indicators \n", "\n", " DESCRIPTION FREQUENCY \\\n", "0 Percent of commuter trains that arrive at thei... M \n", "1 Percent of commuter trains that arrive at thei... M \n", "2 Percent of commuter trains that arrive at thei... M \n", "3 Percent of commuter trains that arrive at thei... M \n", "4 Percent of commuter trains that arrive at thei... M \n", "\n", " INDICATOR_NAME INDICATOR_UNIT MONTHLY_ACTUAL \\\n", "0 On-Time Performance (West of Hudson) % 96.9 \n", "1 On-Time Performance (West of Hudson) % 95 \n", "2 On-Time Performance (West of Hudson) % 96.9 \n", "3 On-Time Performance (West of Hudson) % 98.3 \n", "4 On-Time Performance (West of Hudson) % 95.8 \n", "\n", " MONTHLY_TARGET PERIOD_MONTH PERIOD_YEAR YTD_ACTUAL YTD_TARGET \n", "0 95 1 2008 96.9 95 \n", "1 95 2 2008 96 95 \n", "2 95 3 2008 96.3 95 \n", "3 95 4 2008 96.8 95 \n", "4 95 5 2008 96.6 95 " ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "perf.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }