{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "################################################################\n", "## ImportingData #1.1\n", "## Atul Singh\n", "## www.datagenx.net\n", "################################################################" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Importing Data #1\n", "### #1.1 Introduction with flat files" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# import \n", "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Shell commands (`!` prefix)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Volume in drive C is OS\n", " Volume Serial Number is D202-8009\n", "\n", " Directory of C:\\learn\\Git\\MachineLearning\\python_DC\n", "\n", "21-12-2016 14:57 .\n", "21-12-2016 14:57 ..\n", "21-12-2016 13:38 .ipynb_checkpoints\n", "21-12-2016 14:51 1,038 ImportingData_#1.ipynb\n", "21-12-2016 14:57 108,285 titanic.csv\n", "21-12-2016 13:29 513,536 titanic.xls\n", " 3 File(s) 622,859 bytes\n", " 3 Dir(s) 311,402,659,840 bytes free\n" ] } ], "source": [ "!dir" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### reading the file, read only" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "fh = open(\"dataset/titanic.txt\", \"r\")\n", "data = fh.read()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# print(data)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# closing the file handler\n", "fh.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Importing file line by line with func 'open'" 
] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1311\n" ] } ], "source": [ "fh = open(\"dataset/titanic.txt\", \"r\")\n", "count = 0\n", "for line in fh:\n", " count = count+1\n", "print(count)\n", " " ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1311\n" ] } ], "source": [ "count = 0\n", "with open(\"dataset/titanic.txt\",\"r\") as fh:\n", " for line in fh:\n", " count = count+1\n", " print(count)\n", " " ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pclass|survived|name|sex|age|sibsp|parch|ticket|fare|cabin|embarked|boat|body|home.dest\n", "\n", "1|1|Allen, Miss. Elisabeth Walton|female|29|0|0|24160|211.3375|B5|S|2||St Louis, MO\n", "\n", "1|1|Allison, Master. Hudson Trevor|male|0.9167|1|2|113781|151.5500|C22 C26|S|11||Montreal, PQ / Chesterville, ON\n", "\n" ] } ], "source": [ "with open(\"dataset/titanic.txt\",\"r\") as fh:\n", " print(fh.readline()) #print 1st line\n", " print(fh.readline()) #print 2nd line\n", " print(fh.readline()) #print 3rd line " ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The Zen of Python, by Tim Peters\n", "\n", "Beautiful is better than ugly.\n", "Explicit is better than implicit.\n", "Simple is better than complex.\n", "Complex is better than complicated.\n", "Flat is better than nested.\n", "Sparse is better than dense.\n", "Readability counts.\n", "Special cases aren't special enough to break the rules.\n", "Although practicality beats purity.\n", "Errors should never pass silently.\n", "Unless explicitly silenced.\n", "In the face of ambiguity, refuse the temptation to guess.\n", "There should be 
one-- and preferably only one --obvious way to do it.\n", "Although that way may not be obvious at first unless you're Dutch.\n", "Now is better than never.\n", "Although never is often better than *right* now.\n", "If the implementation is hard to explain, it's a bad idea.\n", "If the implementation is easy to explain, it may be a good idea.\n", "Namespaces are one honking great idea -- let's do more of those!\n" ] } ], "source": [ "# Zen of python\n", "import this" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading file with numpy 'loadtxt' func" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 1. 2. 3. 4. 5. 6. 7.]\n", " [ 8. 9. 0. 1. 2. 3. 4.]\n", " [ 6. 7. 8. 9. 4. 2. 4.]\n", " [ 3. 4. 6. 3. 2. 5. 7.]\n", " [ 0. 9. 7. 4. 7. 5. 3.]]\n" ] } ], "source": [ "# reading file in numpy\n", "#fhand = np.loadtxt(\"dataset/Employees.csv\", delimiter=',')\n", "# after trying so many files I got to know that we cant read non numeric data with this loadtxt method\n", "fhand = np.loadtxt(\"dataset/numbers.txt\", delimiter=',')\n", "type(fhand)\n", "print(fhand)\n", "#for i in fhand:\n", "# print(i)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[ 6. 9.]\n", " [ 3. 3.]\n", " [ 0. 4.]]\n" ] } ], "source": [ "fhand = np.loadtxt(\"dataset/numbers.txt\", delimiter=',', skiprows=2, usecols=[0,3]) # 1st and 4th column\n", "print(fhand)\n" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[\"b'pclass'\" \"b'survived'\" \"b'name'\" \"b'sex'\" \"b'age'\" \"b'sibsp'\"\n", " \"b'parch'\" \"b'ticket'\" \"b'fare'\" \"b'cabin'\" \"b'embarked'\" \"b'boat'\"\n", " \"b'body'\" \"b'home.dest'\"]\n", " [\"b'1'\" \"b'1'\" \"b'Allen, Miss. 
Elisabeth Walton'\" \"b'female'\" \"b'29'\"\n", " \"b'0'\" \"b'0'\" \"b'24160'\" \"b'211.3375'\" \"b'B5'\" \"b'S'\" \"b'2'\" \"b''\"\n", " \"b'St Louis, MO'\"]\n", " [\"b'1'\" \"b'1'\" \"b'Allison, Master. Hudson Trevor'\" \"b'male'\" \"b'0.9167'\"\n", " \"b'1'\" \"b'2'\" \"b'113781'\" \"b'151.5500'\" \"b'C22 C26'\" \"b'S'\" \"b'11'\" \"b''\"\n", " \"b'Montreal, PQ / Chesterville, ON'\"]\n", " [\"b'1'\" \"b'0'\" \"b'Allison, Miss. Helen Loraine'\" \"b'female'\" \"b'2'\" \"b'1'\"\n", " \"b'2'\" \"b'113781'\" \"b'151.5500'\" \"b'C22 C26'\" \"b'S'\" \"b''\" \"b''\"\n", " \"b'Montreal, PQ / Chesterville, ON'\"]]\n" ] } ], "source": [ "# by default, loadtxt func will take the numeric data but we can force to read the str data with dtype option\n", "fhand = np.loadtxt(\"dataset/titanic.txt\", delimiter='|', dtype=str)\n", "print(fhand[0:4])" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[\"b'1'\" \"b'1'\" \"b'Allen, Miss. Elisabeth Walton'\" \"b'female'\" \"b'29'\"]\n", " [\"b'1'\" \"b'1'\" \"b'Allison, Master. Hudson Trevor'\" \"b'male'\" \"b'0.9167'\"]\n", " [\"b'1'\" \"b'0'\" \"b'Allison, Miss. Helen Loraine'\" \"b'female'\" \"b'2'\"]\n", " [\"b'1'\" \"b'0'\" \"b'Allison, Mr. Hudson Joshua Creighton'\" \"b'male'\" \"b'30'\"]]\n" ] } ], "source": [ "# skip the header\n", "fhand = np.loadtxt(\"dataset/titanic.txt\", delimiter='|', dtype=str, skiprows=1, usecols=[0,1,2,3,4])\n", "print(fhand[0:4])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading file with numpy 'genfromtxt' func" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(1, 1, b'Allen, Miss. Elisabeth Walton', b'female', 29.0)\n", " (1, 1, b'Allison, Master. Hudson Trevor', b'male', 0.9167)\n", " (1, 0, b'Allison, Miss. 
Helen Loraine', b'female', 2.0)\n", " (1, 0, b'Allison, Mr. Hudson Joshua Creighton', b'male', 30.0)]\n" ] } ], "source": [ "fhand = np.genfromtxt(\"dataset/titanic.txt\", delimiter='|', dtype=None, names=True, usecols=[0,1,2,3,4])\n", "print(fhand[0:4])\n", "\n", "# dtype=None, if set, np identify the data type\n", "# names=True, if set, first line is header" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Reading file with numpy 'recfromcsv' func\n", "In this func, below are the defaults:\n", "```\n", " dtype is None\n", " delimiter is ',' \n", " names is True\n", "```" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(1, 1, b'Allen, Miss. Elisabeth Walton', b'female', 29.0)\n", " (1, 1, b'Allison, Master. Hudson Trevor', b'male', 0.9167)\n", " (1, 0, b'Allison, Miss. Helen Loraine', b'female', 2.0)\n", " (1, 0, b'Allison, Mr. Hudson Joshua Creighton', b'male', 30.0)]\n" ] } ], "source": [ "fhand = np.recfromcsv(\"dataset/titanic.txt\", delimiter='|', names=True, usecols=[0,1,2,3,4])\n", "print(fhand[0:4])" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ (b'AARON', b'ELVIA J', b'WATER RATE TAKER', b'WATER MGMNT', b'$81000.00', b'$73862.00')\n", " (b'AARON', b'JEFFERY M', b'POLICE OFFICER', b'POLICE', b'$74628.00', b'$74628.00')\n", " (b'AARON', b'KIMBERLEI R', b'CHIEF CONTRACT EXPEDITER', b'FLEET MANAGEMNT', b'$77280.00', b'$70174.00')\n", " (b'ABAD JR', b'VICENTE M', b'CIVIL ENGINEER IV', b'WATER MGMNT', b'$96276.00', b'$96276.00')]\n" ] } ], "source": [ "fhand = np.recfromcsv(\"dataset/Employees.csv\")\n", "print(fhand[0:4])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using Pandas read_csv func to read file" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "collapsed": false }, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ " LNAME FNAME JOB TITLE DEPARTMENT \\\n", "0 AARON ELVIA J WATER RATE TAKER WATER MGMNT \n", "1 AARON JEFFERY M POLICE OFFICER POLICE \n", "2 AARON KIMBERLEI R CHIEF CONTRACT EXPEDITER FLEET MANAGEMNT \n", "3 ABAD JR VICENTE M CIVIL ENGINEER IV WATER MGMNT \n", "4 ABBATACOLA ROBERT J ELECTRICAL MECHANIC WATER MGMNT \n", "\n", " EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS \n", "0 $81000.00 $73862.00 \n", "1 $74628.00 $74628.00 \n", "2 $77280.00 $70174.00 \n", "3 $96276.00 $96276.00 \n", "4 $84032.00 $76627.00 \n" ] } ], "source": [ "fh = pd.read_csv(\"dataset/Employees.csv\")\n", "print(fh[0:5])" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LNAMEFNAMEJOB TITLEDEPARTMENTEMPLOYEE ANNUAL SALARYESTIMATED ANNUAL SALARY MINUS FURLOUGHS
0AARONELVIA JWATER RATE TAKERWATER MGMNT$81000.00$73862.00
1AARONJEFFERY MPOLICE OFFICERPOLICE$74628.00$74628.00
2AARONKIMBERLEI RCHIEF CONTRACT EXPEDITERFLEET MANAGEMNT$77280.00$70174.00
3ABAD JRVICENTE MCIVIL ENGINEER IVWATER MGMNT$96276.00$96276.00
\n", "
" ], "text/plain": [ " LNAME FNAME JOB TITLE DEPARTMENT \\\n", "0 AARON ELVIA J WATER RATE TAKER WATER MGMNT \n", "1 AARON JEFFERY M POLICE OFFICER POLICE \n", "2 AARON KIMBERLEI R CHIEF CONTRACT EXPEDITER FLEET MANAGEMNT \n", "3 ABAD JR VICENTE M CIVIL ENGINEER IV WATER MGMNT \n", "\n", " EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS \n", "0 $81000.00 $73862.00 \n", "1 $74628.00 $74628.00 \n", "2 $77280.00 $70174.00 \n", "3 $96276.00 $96276.00 " ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fh.head(4) " ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456
01234567
18901234
26789424
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6\n", "0 1 2 3 4 5 6 7\n", "1 8 9 0 1 2 3 4\n", "2 6 7 8 9 4 2 4" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Some more function\n", "fh_pd = pd.read_csv(\"dataset/numbers.txt\", header=None, nrows=3)\n", "fh_pd" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[1, 2, 3, 4, 5, 6, 7],\n", " [8, 9, 0, 1, 2, 3, 4],\n", " [6, 7, 8, 9, 4, 2, 4]], dtype=int64)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Values only\n", "fh_pd.values" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([[1, 2, 3, 4, 5, 6, 7],\n", " [8, 9, 0, 1, 2, 3, 4],\n", " [6, 7, 8, 9, 4, 2, 4]], dtype=int64)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# creating numpy array\n", "narr = np.array(fh_pd.values)\n", "narr" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " \n" ] } ], "source": [ "print(type(fh_pd), type(fh_pd.values), type(narr))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "#pd.read_csv(file, sep=\"\\t\", comment=\"#\", na_values=[\"Nothing\"])\n", "# comment - ignore the data if it start with #\n", "# na_values - convert Nothing to NA or NaN (python value for missing data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "############################################################\n", "## Atul Singh | www.datagenx.net | lnked.in/atulsingh\n", 
"############################################################" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 0 }