{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Imports" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.20.3\n" ] } ], "source": [ "import pandas as pd\n", "import os\n", "\n", "print pd.__version__" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Removing duplicate rows (Vid-26)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agegenderoccupationzipcode
user_id
124Mtechnician85711
253Fother94043
323Mwriter32067
424Mtechnician43537
533Fother15213
\n", "
" ], "text/plain": [ " age gender occupation zipcode\n", "user_id \n", "1 24 M technician 85711\n", "2 53 F other 94043\n", "3 23 M writer 32067\n", "4 24 M technician 43537\n", "5 33 F other 15213" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_table(\n", " \"http://bit.ly/movieusers\",\n", " sep=\"|\",\n", " header=None,\n", " names=[\"user_id\", \"age\", \"gender\", \"occupation\", \"zipcode\"],\n", " index_col=\"user_id\"\n", " )\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(943, 4)" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "7" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# number of rows that are duplicate\n", "df.duplicated().sum()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agegenderoccupationzipcode
user_id
49621Fstudent55414
57251Meducator20003
62117Mstudent60402
68428Mstudent55414
73344Fother60630
80527Fother20009
89032Mstudent97301
\n", "
" ], "text/plain": [ " age gender occupation zipcode\n", "user_id \n", "496 21 F student 55414\n", "572 51 M educator 20003\n", "621 17 M student 60402\n", "684 28 M student 55414\n", "733 44 F other 60630\n", "805 27 F other 20009\n", "890 32 M student 97301" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we are trying to location of duplicated elements\n", "df.loc[df.duplicated()]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agegenderoccupationzipcode
user_id
6717Mstudent60402
8551Meducator20003
19821Fstudent55414
35032Mstudent97301
42828Mstudent55414
43727Fother20009
46044Fother60630
49621Fstudent55414
57251Meducator20003
62117Mstudent60402
68428Mstudent55414
73344Fother60630
80527Fother20009
89032Mstudent97301
\n", "
" ], "text/plain": [ " age gender occupation zipcode\n", "user_id \n", "67 17 M student 60402\n", "85 51 M educator 20003\n", "198 21 F student 55414\n", "350 32 M student 97301\n", "428 28 M student 55414\n", "437 27 F other 20009\n", "460 44 F other 60630\n", "496 21 F student 55414\n", "572 51 M educator 20003\n", "621 17 M student 60402\n", "684 28 M student 55414\n", "733 44 F other 60630\n", "805 27 F other 20009\n", "890 32 M student 97301" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# but if you notice, we can just locate one of the occurence that was duplicate of some other.\n", "# keep=False let's you see all the utterences\n", "df.loc[df.duplicated(keep=False)]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "943-7 = 936\n" ] } ], "source": [ "# we started with (943, 4) and now it is (936, 4)\n", "df.drop_duplicates().shape\n", "\n", "print \n", "print \"943-7 = 936\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Takeaways\n", "\n", "1. 'keep' paramater let's you see all the duplicated pairs.\n", "\n", "# -------------------------------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Date & Time in Pandas (Vid-25)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
0IthacaNaNTRIANGLENY6/1/1930 22:00
1WillingboroNaNOTHERNJ6/30/1930 20:00
2HolyokeNaNOVALCO2/15/1931 14:00
3AbileneNaNDISKKS6/1/1931 13:00
4New York Worlds FairNaNLIGHTNY4/18/1933 19:00
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State Time\n", "0 Ithaca NaN TRIANGLE NY 6/1/1930 22:00\n", "1 Willingboro NaN OTHER NJ 6/30/1930 20:00\n", "2 Holyoke NaN OVAL CO 2/15/1931 14:00\n", "3 Abilene NaN DISK KS 6/1/1931 13:00\n", "4 New York Worlds Fair NaN LIGHT NY 4/18/1933 19:00" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(\n", " \"http://bit.ly/uforeports\"\n", " )\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "City object\n", "Colors Reported object\n", "Shape Reported object\n", "State object\n", "Time object\n", "dtype: object" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# anything that has a dtype as object in pandas is stored as a string\n", "df.dtypes" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CityColors ReportedShape ReportedStateTime
0IthacaNaNTRIANGLENY1930-06-01 22:00:00
1WillingboroNaNOTHERNJ1930-06-30 20:00:00
2HolyokeNaNOVALCO1931-02-15 14:00:00
3AbileneNaNDISKKS1931-06-01 13:00:00
4New York Worlds FairNaNLIGHTNY1933-04-18 19:00:00
\n", "
" ], "text/plain": [ " City Colors Reported Shape Reported State \\\n", "0 Ithaca NaN TRIANGLE NY \n", "1 Willingboro NaN OTHER NJ \n", "2 Holyoke NaN OVAL CO \n", "3 Abilene NaN DISK KS \n", "4 New York Worlds Fair NaN LIGHT NY \n", "\n", " Time \n", "0 1930-06-01 22:00:00 \n", "1 1930-06-30 20:00:00 \n", "2 1931-02-15 14:00:00 \n", "3 1931-06-01 13:00:00 \n", "4 1933-04-18 19:00:00 " ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.Time = pd.to_datetime(df.Time)\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "City object\n", "Colors Reported object\n", "Shape Reported object\n", "State object\n", "Time datetime64[ns]\n", "dtype: object" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dtypes" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 1930-06-01\n", "1 1930-06-30\n", "Name: Time, dtype: object" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.Time.dt.date.head(2)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 6\n", "1 6\n", "Name: Time, dtype: int64" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.Time.dt.month.head(2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Takeaways\n", "\n", "1. Read the docs for many other attributes that exist for pandas datetime." ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }