{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Imports" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.20.3\n" ] } ], "source": [ "import pandas as pd\n", "import os\n", "\n", "print pd.__version__" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Removing duplicate rows (Vid-26)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>age</th>\n", " <th>gender</th>\n", " <th>occupation</th>\n", " <th>zipcode</th>\n", " </tr>\n", " <tr>\n", " <th>user_id</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>1</th>\n", " <td>24</td>\n", " <td>M</td>\n", " <td>technician</td>\n", " <td>85711</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>53</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>94043</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>23</td>\n", " <td>M</td>\n", " <td>writer</td>\n", " <td>32067</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>24</td>\n", " <td>M</td>\n", " <td>technician</td>\n", " <td>43537</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>33</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>15213</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " age gender occupation zipcode\n", "user_id \n", "1 24 M technician 85711\n", "2 53 F other 94043\n", "3 23 M writer 32067\n", "4 24 M technician 43537\n", "5 33 F other 15213" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_table(\n", " \"http://bit.ly/movieusers\",\n", " sep=\"|\",\n", " header=None,\n", " names=[\"user_id\", \"age\", \"gender\", \"occupation\", \"zipcode\"],\n", " index_col=\"user_id\"\n", " )\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(943, 4)" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "7" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# number of rows that are duplicate\n", "df.duplicated().sum()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>age</th>\n", " <th>gender</th>\n", " <th>occupation</th>\n", " <th>zipcode</th>\n", " </tr>\n", " <tr>\n", " <th>user_id</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>496</th>\n", " <td>21</td>\n", " <td>F</td>\n", " <td>student</td>\n", " <td>55414</td>\n", " </tr>\n", " <tr>\n", " <th>572</th>\n", " <td>51</td>\n", " <td>M</td>\n", " <td>educator</td>\n", " <td>20003</td>\n", " </tr>\n", " <tr>\n", " <th>621</th>\n", " <td>17</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>60402</td>\n", " </tr>\n", " <tr>\n", " <th>684</th>\n", " <td>28</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>55414</td>\n", " </tr>\n", " <tr>\n", " <th>733</th>\n", " <td>44</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>60630</td>\n", " </tr>\n", " <tr>\n", " <th>805</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>20009</td>\n", " </tr>\n", " <tr>\n", " <th>890</th>\n", " <td>32</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>97301</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " age gender occupation zipcode\n", "user_id \n", "496 21 F student 55414\n", "572 51 M educator 20003\n", "621 17 M student 60402\n", "684 28 M student 55414\n", "733 44 F other 60630\n", "805 27 F other 20009\n", "890 32 M student 97301" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we are trying to location of duplicated elements\n", "df.loc[df.duplicated()]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>age</th>\n", " <th>gender</th>\n", " <th>occupation</th>\n", " <th>zipcode</th>\n", " </tr>\n", " <tr>\n", " <th>user_id</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>67</th>\n", " <td>17</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>60402</td>\n", " </tr>\n", " <tr>\n", " <th>85</th>\n", " <td>51</td>\n", " <td>M</td>\n", " <td>educator</td>\n", " <td>20003</td>\n", " </tr>\n", " <tr>\n", " <th>198</th>\n", " <td>21</td>\n", " <td>F</td>\n", " <td>student</td>\n", " <td>55414</td>\n", " </tr>\n", " <tr>\n", " <th>350</th>\n", " <td>32</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>97301</td>\n", " </tr>\n", " <tr>\n", " <th>428</th>\n", " <td>28</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>55414</td>\n", " </tr>\n", " <tr>\n", " <th>437</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>20009</td>\n", " </tr>\n", " <tr>\n", " <th>460</th>\n", " <td>44</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>60630</td>\n", " </tr>\n", " <tr>\n", " <th>496</th>\n", " <td>21</td>\n", " <td>F</td>\n", " <td>student</td>\n", " <td>55414</td>\n", " </tr>\n", " <tr>\n", " <th>572</th>\n", " <td>51</td>\n", " <td>M</td>\n", " <td>educator</td>\n", " <td>20003</td>\n", " </tr>\n", " <tr>\n", " <th>621</th>\n", " <td>17</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>60402</td>\n", " </tr>\n", " <tr>\n", " <th>684</th>\n", " <td>28</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>55414</td>\n", " </tr>\n", " <tr>\n", " <th>733</th>\n", " <td>44</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>60630</td>\n", " </tr>\n", " <tr>\n", " <th>805</th>\n", " <td>27</td>\n", " <td>F</td>\n", " <td>other</td>\n", " <td>20009</td>\n", " </tr>\n", " <tr>\n", " <th>890</th>\n", " <td>32</td>\n", " <td>M</td>\n", " <td>student</td>\n", " <td>97301</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " age gender occupation zipcode\n", "user_id \n", "67 17 M student 60402\n", "85 51 M educator 20003\n", "198 21 F student 55414\n", "350 32 M student 97301\n", "428 28 M student 55414\n", "437 27 F other 20009\n", "460 44 F other 60630\n", "496 21 F student 55414\n", "572 51 M educator 20003\n", "621 17 M student 60402\n", "684 28 M student 55414\n", "733 44 F other 60630\n", "805 27 F other 20009\n", "890 32 M student 97301" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# but if you notice, we can just locate one of the occurence that was duplicate of some other.\n", "# keep=False let's you see all the utterences\n", "df.loc[df.duplicated(keep=False)]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "943-7 = 936\n" ] } ], "source": [ "# we started with (943, 4) and now it is (936, 4)\n", "df.drop_duplicates().shape\n", "\n", "print \n", "print \"943-7 = 936\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Takeaways\n", "\n", "1. 'keep' paramater let's you see all the duplicated pairs.\n", "\n", "# -------------------------------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Date & Time in Pandas (Vid-25)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>6/1/1930 22:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>6/30/1930 20:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>2/15/1931 14:00</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Abilene</td>\n", " <td>NaN</td>\n", " <td>DISK</td>\n", " <td>KS</td>\n", " <td>6/1/1931 13:00</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>New York Worlds Fair</td>\n", " <td>NaN</td>\n", " <td>LIGHT</td>\n", " <td>NY</td>\n", " <td>4/18/1933 19:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro NaN OTHER NJ 6/30/1930 20:00\n", "2 Holyoke NaN OVAL CO 2/15/1931 14:00\n", "3 Abilene NaN DISK KS 6/1/1931 13:00\n", "4 New York Worlds Fair NaN LIGHT NY 4/18/1933 19:00" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\n", " \"http://bit.ly/uforeports\"\n", " )\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "City object\n", "Colors Reported object\n", "Shape Reported object\n", "State object\n", "Time object\n", "dtype: object" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# anything that has a dtype as object in pandas is stored as a string\n", "df.dtypes" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>City</th>\n", " <th>Colors Reported</th>\n", " <th>Shape Reported</th>\n", " <th>State</th>\n", " <th>Time</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>Ithaca</td>\n", " <td>NaN</td>\n", " <td>TRIANGLE</td>\n", " <td>NY</td>\n", " <td>1930-06-01 22:00:00</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>Willingboro</td>\n", " <td>NaN</td>\n", " <td>OTHER</td>\n", " <td>NJ</td>\n", " <td>1930-06-30 20:00:00</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>Holyoke</td>\n", " <td>NaN</td>\n", " <td>OVAL</td>\n", " <td>CO</td>\n", " <td>1931-02-15 14:00:00</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>Abilene</td>\n", " <td>NaN</td>\n", " <td>DISK</td>\n", " <td>KS</td>\n", " <td>1931-06-01 13:00:00</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>New York Worlds Fair</td>\n", " <td>NaN</td>\n", " <td>LIGHT</td>\n", " <td>NY</td>\n", " <td>1933-04-18 19:00:00</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " City Colors Reported Shape Reported State \\\n", "0 Ithaca NaN TRIANGLE NY \n", "1 Willingboro NaN OTHER NJ \n", "2 Holyoke NaN OVAL CO \n", "3 Abilene NaN DISK KS \n", "4 New York Worlds Fair NaN LIGHT NY \n", "\n", " Time \n", "0 1930-06-01 22:00:00 \n", "1 1930-06-30 20:00:00 \n", "2 1931-02-15 14:00:00 \n", "3 1931-06-01 13:00:00 \n", "4 1933-04-18 19:00:00 " ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.Time = pd.to_datetime(df.Time)\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "City object\n", "Colors Reported object\n", "Shape Reported object\n", "State object\n", "Time datetime64[ns]\n", "dtype: object" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 1930-06-01\n", "1 1930-06-30\n", "Name: Time, dtype: object" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.Time.dt.date.head(2)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 6\n", "1 6\n", "Name: Time, dtype: int64" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.Time.dt.month.head(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Takeaways\n", "\n", "1. Read the docs for many other attributes that exist for pandas datetime." ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }