{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "################################################################\n",
    "##  ImportingData #1.1\n",
    "##  Atul Singh\n",
    "##  www.datagenx.net\n",
    "################################################################"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Importing Data #1\n",
    "### #1.1 Introduction with flat files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import \n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Magic commands"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Volume in drive C is OS\n",
      " Volume Serial Number is D202-8009\n",
      "\n",
      " Directory of C:\\learn\\Git\\MachineLearning\\python_DC\n",
      "\n",
      "21-12-2016  14:57    <DIR>          .\n",
      "21-12-2016  14:57    <DIR>          ..\n",
      "21-12-2016  13:38    <DIR>          .ipynb_checkpoints\n",
      "21-12-2016  14:51             1,038 ImportingData_#1.ipynb\n",
      "21-12-2016  14:57           108,285 titanic.csv\n",
      "21-12-2016  13:29           513,536 titanic.xls\n",
      "               3 File(s)        622,859 bytes\n",
      "               3 Dir(s)  311,402,659,840 bytes free\n"
     ]
    }
   ],
   "source": [
    "# Use IPython's built-in `ls` alias magic to list the working directory,\n",
    "# rather than relying on the host shell providing a `dir` command.\n",
    "%ls"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading the whole file in read-only mode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Open in read-only text mode; the handle is closed explicitly in a later cell\n",
    "fh = open(\"dataset/titanic.txt\", mode=\"r\")\n",
    "# Pull the entire file contents into a single string\n",
    "data = fh.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# print(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# closing the file handler\n",
    "fh.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading a file line by line with the built-in `open` function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1311\n"
     ]
    }
   ],
   "source": [
    "# Count the lines by iterating over a manually opened file handle\n",
    "fh = open(\"dataset/titanic.txt\", \"r\")\n",
    "count = 0\n",
    "try:\n",
    "    for line in fh:\n",
    "        count += 1\n",
    "finally:\n",
    "    # Always release the handle, even if reading fails part-way through\n",
    "    fh.close()\n",
    "print(count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1311\n"
     ]
    }
   ],
   "source": [
    "# Same line count, but the `with` block closes the file automatically\n",
    "count = 0  # remains 0 when the file has no lines\n",
    "with open(\"dataset/titanic.txt\", \"r\") as fh:\n",
    "    for count, line in enumerate(fh, start=1):\n",
    "        pass\n",
    "    print(count)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pclass|survived|name|sex|age|sibsp|parch|ticket|fare|cabin|embarked|boat|body|home.dest\n",
      "\n",
      "1|1|Allen, Miss. Elisabeth Walton|female|29|0|0|24160|211.3375|B5|S|2||St Louis, MO\n",
      "\n",
      "1|1|Allison, Master. Hudson Trevor|male|0.9167|1|2|113781|151.5500|C22 C26|S|11||Montreal, PQ / Chesterville, ON\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Print the first three lines, one readline() call per line.\n",
    "# readline() keeps the trailing newline and print() adds another,\n",
    "# hence the blank line after each record in the output.\n",
    "with open(\"dataset/titanic.txt\", \"r\") as fh:\n",
    "    for line_no in range(3):\n",
    "        print(fh.readline())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The Zen of Python, by Tim Peters\n",
      "\n",
      "Beautiful is better than ugly.\n",
      "Explicit is better than implicit.\n",
      "Simple is better than complex.\n",
      "Complex is better than complicated.\n",
      "Flat is better than nested.\n",
      "Sparse is better than dense.\n",
      "Readability counts.\n",
      "Special cases aren't special enough to break the rules.\n",
      "Although practicality beats purity.\n",
      "Errors should never pass silently.\n",
      "Unless explicitly silenced.\n",
      "In the face of ambiguity, refuse the temptation to guess.\n",
      "There should be one-- and preferably only one --obvious way to do it.\n",
      "Although that way may not be obvious at first unless you're Dutch.\n",
      "Now is better than never.\n",
      "Although never is often better than *right* now.\n",
      "If the implementation is hard to explain, it's a bad idea.\n",
      "If the implementation is easy to explain, it may be a good idea.\n",
      "Namespaces are one honking great idea -- let's do more of those!\n"
     ]
    }
   ],
   "source": [
    "# Zen of python\n",
    "import this"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading file with numpy 'loadtxt' func"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 1.  2.  3.  4.  5.  6.  7.]\n",
      " [ 8.  9.  0.  1.  2.  3.  4.]\n",
      " [ 6.  7.  8.  9.  4.  2.  4.]\n",
      " [ 3.  4.  6.  3.  2.  5.  7.]\n",
      " [ 0.  9.  7.  4.  7.  5.  3.]]\n"
     ]
    }
   ],
   "source": [
    "# Read a purely numeric, comma-separated file into a numpy array.\n",
    "# By default np.loadtxt parses every field as a number, so a file with text\n",
    "# columns (e.g. dataset/Employees.csv) fails unless a dtype is passed explicitly.\n",
    "fhand = np.loadtxt(\"dataset/numbers.txt\", delimiter=',')\n",
    "print(fhand)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[ 6.  9.]\n",
      " [ 3.  3.]\n",
      " [ 0.  4.]]\n"
     ]
    }
   ],
   "source": [
    "# Skip the first two rows and keep only the 1st and 4th columns\n",
    "fhand = np.loadtxt(\n",
    "    \"dataset/numbers.txt\",\n",
    "    delimiter=',',\n",
    "    skiprows=2,\n",
    "    usecols=[0, 3],\n",
    ")\n",
    "print(fhand)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[\"b'pclass'\" \"b'survived'\" \"b'name'\" \"b'sex'\" \"b'age'\" \"b'sibsp'\"\n",
      "  \"b'parch'\" \"b'ticket'\" \"b'fare'\" \"b'cabin'\" \"b'embarked'\" \"b'boat'\"\n",
      "  \"b'body'\" \"b'home.dest'\"]\n",
      " [\"b'1'\" \"b'1'\" \"b'Allen, Miss. Elisabeth Walton'\" \"b'female'\" \"b'29'\"\n",
      "  \"b'0'\" \"b'0'\" \"b'24160'\" \"b'211.3375'\" \"b'B5'\" \"b'S'\" \"b'2'\" \"b''\"\n",
      "  \"b'St Louis, MO'\"]\n",
      " [\"b'1'\" \"b'1'\" \"b'Allison, Master. Hudson Trevor'\" \"b'male'\" \"b'0.9167'\"\n",
      "  \"b'1'\" \"b'2'\" \"b'113781'\" \"b'151.5500'\" \"b'C22 C26'\" \"b'S'\" \"b'11'\" \"b''\"\n",
      "  \"b'Montreal, PQ / Chesterville, ON'\"]\n",
      " [\"b'1'\" \"b'0'\" \"b'Allison, Miss. Helen Loraine'\" \"b'female'\" \"b'2'\" \"b'1'\"\n",
      "  \"b'2'\" \"b'113781'\" \"b'151.5500'\" \"b'C22 C26'\" \"b'S'\" \"b''\" \"b''\"\n",
      "  \"b'Montreal, PQ / Chesterville, ON'\"]]\n"
     ]
    }
   ],
   "source": [
    "# loadtxt expects numeric fields unless told otherwise;\n",
    "# dtype=str makes it keep every field as text instead\n",
    "fhand = np.loadtxt(\n",
    "    \"dataset/titanic.txt\",\n",
    "    delimiter='|',\n",
    "    dtype=str,\n",
    ")\n",
    "print(fhand[:4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[\"b'1'\" \"b'1'\" \"b'Allen, Miss. Elisabeth Walton'\" \"b'female'\" \"b'29'\"]\n",
      " [\"b'1'\" \"b'1'\" \"b'Allison, Master. Hudson Trevor'\" \"b'male'\" \"b'0.9167'\"]\n",
      " [\"b'1'\" \"b'0'\" \"b'Allison, Miss. Helen Loraine'\" \"b'female'\" \"b'2'\"]\n",
      " [\"b'1'\" \"b'0'\" \"b'Allison, Mr. Hudson Joshua Creighton'\" \"b'male'\" \"b'30'\"]]\n"
     ]
    }
   ],
   "source": [
    "# Drop the header row and keep only the first five columns\n",
    "fhand = np.loadtxt(\n",
    "    \"dataset/titanic.txt\",\n",
    "    delimiter='|',\n",
    "    dtype=str,\n",
    "    skiprows=1,\n",
    "    usecols=list(range(5)),\n",
    ")\n",
    "print(fhand[:4])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading file with numpy 'genfromtxt' func"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(1, 1, b'Allen, Miss. Elisabeth Walton', b'female', 29.0)\n",
      " (1, 1, b'Allison, Master. Hudson Trevor', b'male', 0.9167)\n",
      " (1, 0, b'Allison, Miss. Helen Loraine', b'female', 2.0)\n",
      " (1, 0, b'Allison, Mr. Hudson Joshua Creighton', b'male', 30.0)]\n"
     ]
    }
   ],
   "source": [
    "# dtype=None -> numpy works out a data type for each column\n",
    "# names=True -> the first line of the file is treated as the header\n",
    "fhand = np.genfromtxt(\n",
    "    \"dataset/titanic.txt\",\n",
    "    delimiter='|',\n",
    "    dtype=None,\n",
    "    names=True,\n",
    "    usecols=list(range(5)),\n",
    ")\n",
    "print(fhand[:4])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Reading file with numpy 'recfromcsv' func\n",
    "In this func, below are the defaults:\n",
    "```\n",
    "dtype is None\n",
    "delimiter is ','\n",
    "names is True\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(1, 1, b'Allen, Miss. Elisabeth Walton', b'female', 29.0)\n",
      " (1, 1, b'Allison, Master. Hudson Trevor', b'male', 0.9167)\n",
      " (1, 0, b'Allison, Miss. Helen Loraine', b'female', 2.0)\n",
      " (1, 0, b'Allison, Mr. Hudson Joshua Creighton', b'male', 30.0)]\n"
     ]
    }
   ],
   "source": [
    "# Override the default ',' delimiter; the header row supplies the field names\n",
    "# NOTE(review): recent NumPy releases appear to no longer expose recfromcsv\n",
    "# in the top-level namespace - confirm before upgrading NumPy\n",
    "fhand = np.recfromcsv(\n",
    "    \"dataset/titanic.txt\",\n",
    "    delimiter='|',\n",
    "    names=True,\n",
    "    usecols=list(range(5)),\n",
    ")\n",
    "print(fhand[:4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ (b'AARON', b'ELVIA J', b'WATER RATE TAKER', b'WATER MGMNT', b'$81000.00', b'$73862.00')\n",
      " (b'AARON', b'JEFFERY M', b'POLICE OFFICER', b'POLICE', b'$74628.00', b'$74628.00')\n",
      " (b'AARON', b'KIMBERLEI R', b'CHIEF CONTRACT EXPEDITER', b'FLEET MANAGEMNT', b'$77280.00', b'$70174.00')\n",
      " (b'ABAD JR', b'VICENTE M', b'CIVIL ENGINEER IV', b'WATER MGMNT', b'$96276.00', b'$96276.00')]\n"
     ]
    }
   ],
   "source": [
    "# Rely on recfromcsv's defaults here: ',' delimiter, header row as names, dtype=None\n",
    "fhand = np.recfromcsv(\"dataset/Employees.csv\")\n",
    "print(fhand[:4])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Using Pandas read_csv func to read file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "        LNAME        FNAME                 JOB TITLE       DEPARTMENT  \\\n",
      "0       AARON      ELVIA J          WATER RATE TAKER      WATER MGMNT   \n",
      "1       AARON    JEFFERY M            POLICE OFFICER           POLICE   \n",
      "2       AARON  KIMBERLEI R  CHIEF CONTRACT EXPEDITER  FLEET MANAGEMNT   \n",
      "3     ABAD JR    VICENTE M         CIVIL ENGINEER IV      WATER MGMNT   \n",
      "4  ABBATACOLA     ROBERT J       ELECTRICAL MECHANIC      WATER MGMNT   \n",
      "\n",
      "  EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  \n",
      "0              $81000.00                               $73862.00  \n",
      "1              $74628.00                               $74628.00  \n",
      "2              $77280.00                               $70174.00  \n",
      "3              $96276.00                               $96276.00  \n",
      "4              $84032.00                               $76627.00  \n"
     ]
    }
   ],
   "source": [
    "# Load the CSV into a DataFrame and show its first five rows as plain text\n",
    "fh = pd.read_csv(\"dataset/Employees.csv\")\n",
    "print(fh.head(5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>LNAME</th>\n",
       "      <th>FNAME</th>\n",
       "      <th>JOB TITLE</th>\n",
       "      <th>DEPARTMENT</th>\n",
       "      <th>EMPLOYEE ANNUAL SALARY</th>\n",
       "      <th>ESTIMATED ANNUAL SALARY MINUS FURLOUGHS</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AARON</td>\n",
       "      <td>ELVIA J</td>\n",
       "      <td>WATER RATE TAKER</td>\n",
       "      <td>WATER MGMNT</td>\n",
       "      <td>$81000.00</td>\n",
       "      <td>$73862.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AARON</td>\n",
       "      <td>JEFFERY M</td>\n",
       "      <td>POLICE OFFICER</td>\n",
       "      <td>POLICE</td>\n",
       "      <td>$74628.00</td>\n",
       "      <td>$74628.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>AARON</td>\n",
       "      <td>KIMBERLEI R</td>\n",
       "      <td>CHIEF CONTRACT EXPEDITER</td>\n",
       "      <td>FLEET MANAGEMNT</td>\n",
       "      <td>$77280.00</td>\n",
       "      <td>$70174.00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ABAD JR</td>\n",
       "      <td>VICENTE M</td>\n",
       "      <td>CIVIL ENGINEER IV</td>\n",
       "      <td>WATER MGMNT</td>\n",
       "      <td>$96276.00</td>\n",
       "      <td>$96276.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     LNAME        FNAME                 JOB TITLE       DEPARTMENT  \\\n",
       "0    AARON      ELVIA J          WATER RATE TAKER      WATER MGMNT   \n",
       "1    AARON    JEFFERY M            POLICE OFFICER           POLICE   \n",
       "2    AARON  KIMBERLEI R  CHIEF CONTRACT EXPEDITER  FLEET MANAGEMNT   \n",
       "3  ABAD JR    VICENTE M         CIVIL ENGINEER IV      WATER MGMNT   \n",
       "\n",
       "  EMPLOYEE ANNUAL SALARY ESTIMATED ANNUAL SALARY MINUS FURLOUGHS  \n",
       "0              $81000.00                               $73862.00  \n",
       "1              $74628.00                               $74628.00  \n",
       "2              $77280.00                               $70174.00  \n",
       "3              $96276.00                               $96276.00  "
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fh.head(4) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   0  1  2  3  4  5  6\n",
       "0  1  2  3  4  5  6  7\n",
       "1  8  9  0  1  2  3  4\n",
       "2  6  7  8  9  4  2  4"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# header=None: do not treat the first row as a header, so columns are labelled 0, 1, 2, ...\n",
    "# nrows=3: read only the first three rows of the file\n",
    "fh_pd = pd.read_csv(\n",
    "    \"dataset/numbers.txt\",\n",
    "    header=None,\n",
    "    nrows=3,\n",
    ")\n",
    "fh_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1, 2, 3, 4, 5, 6, 7],\n",
       "       [8, 9, 0, 1, 2, 3, 4],\n",
       "       [6, 7, 8, 9, 4, 2, 4]], dtype=int64)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Values only\n",
    "fh_pd.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1, 2, 3, 4, 5, 6, 7],\n",
       "       [8, 9, 0, 1, 2, 3, 4],\n",
       "       [6, 7, 8, 9, 4, 2, 4]], dtype=int64)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# creating numpy array\n",
    "narr = np.array(fh_pd.values)\n",
    "narr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>\n"
     ]
    }
   ],
   "source": [
    "# Show the types of the DataFrame, its .values attribute and the array built from it\n",
    "print(*(type(obj) for obj in (fh_pd, fh_pd.values, narr)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#pd.read_csv(file, sep=\"\\t\", comment=\"#\", na_values=[\"Nothing\"])\n",
    "# comment - ignore the rest of a line once the '#' character is encountered\n",
    "# na_values - treat the string 'Nothing' as NA/NaN (the pandas marker for missing data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "############################################################\n",
    "## Atul Singh  | www.datagenx.net | lnked.in/atulsingh\n",
    "############################################################"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}