{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.20.3\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "print pd.__version__"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Removing duplicate rows (Vid-26)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zipcode</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>24</td>\n",
       "      <td>M</td>\n",
       "      <td>technician</td>\n",
       "      <td>85711</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>53</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>94043</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>23</td>\n",
       "      <td>M</td>\n",
       "      <td>writer</td>\n",
       "      <td>32067</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>24</td>\n",
       "      <td>M</td>\n",
       "      <td>technician</td>\n",
       "      <td>43537</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>33</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>15213</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         age gender  occupation zipcode\n",
       "user_id                                \n",
       "1         24      M  technician   85711\n",
       "2         53      F       other   94043\n",
       "3         23      M      writer   32067\n",
       "4         24      M  technician   43537\n",
       "5         33      F       other   15213"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_table(\n",
    "                     \"http://bit.ly/movieusers\",\n",
    "                     sep=\"|\",\n",
    "                     header=None,\n",
    "                     names=[\"user_id\", \"age\", \"gender\", \"occupation\", \"zipcode\"],\n",
    "                     index_col=\"user_id\"\n",
    "                  )\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(943, 4)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# number of rows that are duplicate\n",
    "df.duplicated().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zipcode</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>496</th>\n",
       "      <td>21</td>\n",
       "      <td>F</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>572</th>\n",
       "      <td>51</td>\n",
       "      <td>M</td>\n",
       "      <td>educator</td>\n",
       "      <td>20003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>621</th>\n",
       "      <td>17</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>60402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684</th>\n",
       "      <td>28</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>733</th>\n",
       "      <td>44</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>60630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>805</th>\n",
       "      <td>27</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>20009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>97301</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         age gender occupation zipcode\n",
       "user_id                               \n",
       "496       21      F    student   55414\n",
       "572       51      M   educator   20003\n",
       "621       17      M    student   60402\n",
       "684       28      M    student   55414\n",
       "733       44      F      other   60630\n",
       "805       27      F      other   20009\n",
       "890       32      M    student   97301"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# we are trying to location of duplicated elements\n",
    "df.loc[df.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>gender</th>\n",
       "      <th>occupation</th>\n",
       "      <th>zipcode</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>67</th>\n",
       "      <td>17</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>60402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>51</td>\n",
       "      <td>M</td>\n",
       "      <td>educator</td>\n",
       "      <td>20003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198</th>\n",
       "      <td>21</td>\n",
       "      <td>F</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>350</th>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>97301</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>428</th>\n",
       "      <td>28</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>437</th>\n",
       "      <td>27</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>20009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>460</th>\n",
       "      <td>44</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>60630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>496</th>\n",
       "      <td>21</td>\n",
       "      <td>F</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>572</th>\n",
       "      <td>51</td>\n",
       "      <td>M</td>\n",
       "      <td>educator</td>\n",
       "      <td>20003</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>621</th>\n",
       "      <td>17</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>60402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>684</th>\n",
       "      <td>28</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>55414</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>733</th>\n",
       "      <td>44</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>60630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>805</th>\n",
       "      <td>27</td>\n",
       "      <td>F</td>\n",
       "      <td>other</td>\n",
       "      <td>20009</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>32</td>\n",
       "      <td>M</td>\n",
       "      <td>student</td>\n",
       "      <td>97301</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         age gender occupation zipcode\n",
       "user_id                               \n",
       "67        17      M    student   60402\n",
       "85        51      M   educator   20003\n",
       "198       21      F    student   55414\n",
       "350       32      M    student   97301\n",
       "428       28      M    student   55414\n",
       "437       27      F      other   20009\n",
       "460       44      F      other   60630\n",
       "496       21      F    student   55414\n",
       "572       51      M   educator   20003\n",
       "621       17      M    student   60402\n",
       "684       28      M    student   55414\n",
       "733       44      F      other   60630\n",
       "805       27      F      other   20009\n",
       "890       32      M    student   97301"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# but if you notice, we can just locate one of the occurence that was duplicate of some other.\n",
    "# keep=False let's you see all the utterences\n",
    "df.loc[df.duplicated(keep=False)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "943-7 = 936\n"
     ]
    }
   ],
   "source": [
    "# we started with (943, 4) and now it is (936, 4)\n",
    "df.drop_duplicates().shape\n",
    "\n",
    "print \n",
    "print \"943-7 = 936\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Takeaways\n",
    "\n",
    "1. 'keep' paramater let's you see all the duplicated pairs.\n",
    "\n",
    "# -------------------------------"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Date & Time in Pandas (Vid-25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>6/1/1930 22:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>6/30/1930 20:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>2/15/1931 14:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "      <td>6/1/1931 13:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>4/18/1933 19:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   City Colors Reported Shape Reported State             Time\n",
       "0                Ithaca             NaN       TRIANGLE    NY   6/1/1930 22:00\n",
       "1           Willingboro             NaN          OTHER    NJ  6/30/1930 20:00\n",
       "2               Holyoke             NaN           OVAL    CO  2/15/1931 14:00\n",
       "3               Abilene             NaN           DISK    KS   6/1/1931 13:00\n",
       "4  New York Worlds Fair             NaN          LIGHT    NY  4/18/1933 19:00"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(\n",
    "                \"http://bit.ly/uforeports\"\n",
    "                    )\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "City               object\n",
       "Colors Reported    object\n",
       "Shape Reported     object\n",
       "State              object\n",
       "Time               object\n",
       "dtype: object"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# anything that has a dtype as object in pandas is stored as a string\n",
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>City</th>\n",
       "      <th>Colors Reported</th>\n",
       "      <th>Shape Reported</th>\n",
       "      <th>State</th>\n",
       "      <th>Time</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Ithaca</td>\n",
       "      <td>NaN</td>\n",
       "      <td>TRIANGLE</td>\n",
       "      <td>NY</td>\n",
       "      <td>1930-06-01 22:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Willingboro</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OTHER</td>\n",
       "      <td>NJ</td>\n",
       "      <td>1930-06-30 20:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Holyoke</td>\n",
       "      <td>NaN</td>\n",
       "      <td>OVAL</td>\n",
       "      <td>CO</td>\n",
       "      <td>1931-02-15 14:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Abilene</td>\n",
       "      <td>NaN</td>\n",
       "      <td>DISK</td>\n",
       "      <td>KS</td>\n",
       "      <td>1931-06-01 13:00:00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>New York Worlds Fair</td>\n",
       "      <td>NaN</td>\n",
       "      <td>LIGHT</td>\n",
       "      <td>NY</td>\n",
       "      <td>1933-04-18 19:00:00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   City Colors Reported Shape Reported State  \\\n",
       "0                Ithaca             NaN       TRIANGLE    NY   \n",
       "1           Willingboro             NaN          OTHER    NJ   \n",
       "2               Holyoke             NaN           OVAL    CO   \n",
       "3               Abilene             NaN           DISK    KS   \n",
       "4  New York Worlds Fair             NaN          LIGHT    NY   \n",
       "\n",
       "                 Time  \n",
       "0 1930-06-01 22:00:00  \n",
       "1 1930-06-30 20:00:00  \n",
       "2 1931-02-15 14:00:00  \n",
       "3 1931-06-01 13:00:00  \n",
       "4 1933-04-18 19:00:00  "
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Time = pd.to_datetime(df.Time)\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "City                       object\n",
       "Colors Reported            object\n",
       "Shape Reported             object\n",
       "State                      object\n",
       "Time               datetime64[ns]\n",
       "dtype: object"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    1930-06-01\n",
       "1    1930-06-30\n",
       "Name: Time, dtype: object"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Time.dt.date.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    6\n",
       "1    6\n",
       "Name: Time, dtype: int64"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.Time.dt.month.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Takeaways\n",
    "\n",
    "1. Read the docs for many other attributes that exist for pandas datetime."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}