{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.20.3\n" ] } ], "source": [ "import pandas as pd\n", "import os\n", "\n", "print pd.__version__" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Removing Columns/Rows (Vid-6)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>sepal_len</th>\n", " <th>sepal_wid</th>\n", " <th>petal_len</th>\n", " <th>petal_wid</th>\n", " <th>class</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>5.1</td>\n", " <td>3.5</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " <td>Iris-setosa</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>4.9</td>\n", " <td>3.0</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " <td>Iris-setosa</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>4.7</td>\n", " <td>3.2</td>\n", " <td>1.3</td>\n", " <td>0.2</td>\n", " <td>Iris-setosa</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4.6</td>\n", " <td>3.1</td>\n", " <td>1.5</td>\n", " <td>0.2</td>\n", " <td>Iris-setosa</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5.0</td>\n", " <td>3.6</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " <td>Iris-setosa</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " sepal_len sepal_wid petal_len petal_wid class\n", "0 5.1 3.5 1.4 0.2 Iris-setosa\n", "1 4.9 3.0 1.4 0.2 Iris-setosa\n", "2 4.7 3.2 1.3 0.2 Iris-setosa\n", "3 4.6 3.1 1.5 0.2 Iris-setosa\n", "4 5.0 3.6 1.4 0.2 Iris-setosa" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "DATA_DIR = '../data'\n", "# reading table\n", "# making seperator as comma\n", "# renaming column names for 0th row of the file\n", "df = pd.read_table(\n", " os.path.abspath(os.path.join(DATA_DIR,'day1/iris.csv')), \n", " sep=',',\n", " header=0,\n", " names=['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']\n", " )\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(150, 5)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# see dimension of the dataset\n", "# 150 rows, 5 columns\n", "df.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Column Drop" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>sepal_len</th>\n", " <th>sepal_wid</th>\n", " <th>petal_len</th>\n", " <th>petal_wid</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>5.1</td>\n", " <td>3.5</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>4.9</td>\n", " <td>3.0</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>4.7</td>\n", " <td>3.2</td>\n", " <td>1.3</td>\n", " <td>0.2</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4.6</td>\n", " <td>3.1</td>\n", " <td>1.5</td>\n", " <td>0.2</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5.0</td>\n", " <td>3.6</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " sepal_len sepal_wid petal_len petal_wid\n", "0 5.1 3.5 1.4 0.2\n", "1 4.9 3.0 1.4 0.2\n", "2 4.7 3.2 1.3 0.2\n", "3 4.6 3.1 1.5 0.2\n", "4 5.0 3.6 1.4 0.2" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop method takes the column names in array\n", "# axis=1 corresponds to columns\n", "# inplace=True does not require you to hold it in other variable, memory efficient\n", "df.drop(['class'], axis=1, inplace=True)\n", "df.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Row Drop" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>sepal_len</th>\n", " <th>sepal_wid</th>\n", " <th>petal_len</th>\n", " <th>petal_wid</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>2</th>\n", " <td>4.7</td>\n", " <td>3.2</td>\n", " <td>1.3</td>\n", " <td>0.2</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4.6</td>\n", " <td>3.1</td>\n", " <td>1.5</td>\n", " <td>0.2</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5.0</td>\n", " <td>3.6</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>5.4</td>\n", " <td>3.9</td>\n", " <td>1.7</td>\n", " <td>0.4</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>4.6</td>\n", " <td>3.4</td>\n", " <td>1.4</td>\n", " <td>0.3</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " sepal_len sepal_wid petal_len petal_wid\n", "2 4.7 3.2 1.3 0.2\n", "3 4.6 3.1 1.5 0.2\n", "4 5.0 3.6 1.4 0.2\n", "5 5.4 3.9 1.7 0.4\n", "6 4.6 3.4 1.4 0.3" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop method takes the row names in array\n", "# axis=0 corresponds to rows, bydefault axis=0 in drop method\n", "# inplace=True does not require you to hold it in other variable, memory efficient\n", "df.drop([0, 1], axis=0, inplace=True)\n", "df.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Takeaways\n", "\n", "1. Keep in practice to always specify 'axis' parameter in drop method or other necessary methods for better understanding." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# -----------------------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Sorting (Vid-7)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>9.3</td>\n", " <td>The Shawshank Redemption</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>142</td>\n", " <td>[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>9.2</td>\n", " <td>The Godfather</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>175</td>\n", " <td>[u'Marlon Brando', u'Al Pacino', u'James Caan']</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>9.1</td>\n", " <td>The Godfather: Part II</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>200</td>\n", " <td>[u'Al Pacino', u'Robert De Niro', u'Robert Duv...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>9.0</td>\n", " <td>The Dark Knight</td>\n", " <td>PG-13</td>\n", " <td>Action</td>\n", " <td>152</td>\n", " <td>[u'Christian Bale', u'Heath Ledger', u'Aaron E...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>8.9</td>\n", " <td>Pulp Fiction</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>154</td>\n", " <td>[u'John Travolta', u'Uma Thurman', u'Samuel L....</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "0 9.3 The Shawshank Redemption R Crime 142 \n", "1 9.2 The Godfather R Crime 175 \n", "2 9.1 The Godfather: Part II R Crime 200 \n", "3 9.0 The Dark Knight PG-13 Action 152 \n", "4 8.9 Pulp Fiction R Crime 154 \n", "\n", " actors_list \n", "0 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... \n", "1 [u'Marlon Brando', u'Al Pacino', u'James Caan'] \n", "2 [u'Al Pacino', u'Robert De Niro', u'Robert Duv... \n", "3 [u'Christian Bale', u'Heath Ledger', u'Aaron E... \n", "4 [u'John Travolta', u'Uma Thurman', u'Samuel L.... " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_table(\n", " 'http://bit.ly/imdbratings', \n", " sep=','\n", " )\n", "df.head(5)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "978 7.4\n", "950 7.4\n", "949 7.4\n", "948 7.4\n", "947 7.4\n", "Name: star_rating, dtype: float64\n", "6 8.9\n", "3 9.0\n", "2 9.1\n", "1 9.2\n", "0 9.3\n", "Name: star_rating, dtype: float64\n" ] } ], "source": [ "# sort_values() method returns bydefault by ascending order\n", "# sort_values() can take 'inplace=True/False' for changing the values inplace\n", "print df['star_rating'].sort_values().head(5)\n", "print df['star_rating'].sort_values().tail(5)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 9.3\n", "1 9.2\n", "2 9.1\n", "3 9.0\n", "6 8.9\n", "Name: star_rating, dtype: float64\n", "947 7.4\n", "948 7.4\n", "949 7.4\n", "950 7.4\n", "978 7.4\n", "Name: star_rating, dtype: float64\n" ] } ], "source": [ "# ascending=True/False parameter in sort_values() can decide the sorting order\n", "print df['star_rating'].sort_values(ascending=False).head(5)\n", "print df['star_rating'].sort_values(ascending=False).tail(5)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>389</th>\n", " <td>8.0</td>\n", " <td>Freaks</td>\n", " <td>UNRATED</td>\n", " <td>Drama</td>\n", " <td>64</td>\n", " <td>[u'Wallace Ford', u'Leila Hyams', u'Olga Bacla...</td>\n", " </tr>\n", " <tr>\n", " <th>338</th>\n", " <td>8.0</td>\n", " <td>Battleship Potemkin</td>\n", " <td>UNRATED</td>\n", " <td>History</td>\n", " <td>66</td>\n", " <td>[u'Aleksandr Antonov', u'Vladimir Barsky', u'G...</td>\n", " </tr>\n", " <tr>\n", " <th>258</th>\n", " <td>8.1</td>\n", " <td>The Cabinet of Dr. Caligari</td>\n", " <td>UNRATED</td>\n", " <td>Crime</td>\n", " <td>67</td>\n", " <td>[u'Werner Krauss', u'Conrad Veidt', u'Friedric...</td>\n", " </tr>\n", " <tr>\n", " <th>293</th>\n", " <td>8.1</td>\n", " <td>Duck Soup</td>\n", " <td>PASSED</td>\n", " <td>Comedy</td>\n", " <td>68</td>\n", " <td>[u'Groucho Marx', u'Harpo Marx', u'Chico Marx']</td>\n", " </tr>\n", " <tr>\n", " <th>88</th>\n", " <td>8.4</td>\n", " <td>The Kid</td>\n", " <td>NOT RATED</td>\n", " <td>Comedy</td>\n", " <td>68</td>\n", " <td>[u'Charles Chaplin', u'Edna Purviance', u'Jack...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating genre \\\n", "389 8.0 Freaks UNRATED Drama \n", "338 8.0 Battleship Potemkin UNRATED History \n", "258 8.1 The Cabinet of Dr. Caligari UNRATED Crime \n", "293 8.1 Duck Soup PASSED Comedy \n", "88 8.4 The Kid NOT RATED Comedy \n", "\n", " duration actors_list \n", "389 64 [u'Wallace Ford', u'Leila Hyams', u'Olga Bacla... \n", "338 66 [u'Aleksandr Antonov', u'Vladimir Barsky', u'G... \n", "258 67 [u'Werner Krauss', u'Conrad Veidt', u'Friedric... \n", "293 68 [u'Groucho Marx', u'Harpo Marx', u'Chico Marx'] \n", "88 68 [u'Charles Chaplin', u'Edna Purviance', u'Jack... " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# relatively better way to do is to use the below mentioned technique\n", "# to sort by multiple fields, just populate the array inside sort_values()\n", "df.sort_values(['duration'], ascending=True).head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Takeaways\n", "\n", "1. Pandas dataframe is table having rows and columns.\n", "2. Pandas Series is just one column in the dataframe.\n", "3. sort_values() method returns bydefault by ascending order." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# -----------------------\n", "\n", "# Single Filter (Vid-8)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>9.3</td>\n", " <td>The Shawshank Redemption</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>142</td>\n", " <td>[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>9.2</td>\n", " <td>The Godfather</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>175</td>\n", " <td>[u'Marlon Brando', u'Al Pacino', u'James Caan']</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>9.1</td>\n", " <td>The Godfather: Part II</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>200</td>\n", " <td>[u'Al Pacino', u'Robert De Niro', u'Robert Duv...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>9.0</td>\n", " <td>The Dark Knight</td>\n", " <td>PG-13</td>\n", " <td>Action</td>\n", " <td>152</td>\n", " <td>[u'Christian Bale', u'Heath Ledger', u'Aaron E...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>8.9</td>\n", " <td>Pulp Fiction</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>154</td>\n", " <td>[u'John Travolta', u'Uma Thurman', u'Samuel L....</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "0 9.3 The Shawshank Redemption R Crime 142 \n", "1 9.2 The Godfather R Crime 175 \n", "2 9.1 The Godfather: Part II R Crime 200 \n", "3 9.0 The Dark Knight PG-13 Action 152 \n", "4 8.9 Pulp Fiction R Crime 154 \n", "\n", " actors_list \n", "0 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... \n", "1 [u'Marlon Brando', u'Al Pacino', u'James Caan'] \n", "2 [u'Al Pacino', u'Robert De Niro', u'Robert Duv... \n", "3 [u'Christian Bale', u'Heath Ledger', u'Aaron E... \n", "4 [u'John Travolta', u'Uma Thurman', u'Samuel L.... " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_table(\n", " 'http://bit.ly/imdbratings', \n", " sep=','\n", " )\n", "df.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Experiment 1" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>35</th>\n", " <td>8.6</td>\n", " <td>Modern Times</td>\n", " <td>G</td>\n", " <td>Comedy</td>\n", " <td>87</td>\n", " <td>[u'Charles Chaplin', u'Paulette Goddard', u'He...</td>\n", " </tr>\n", " <tr>\n", " <th>36</th>\n", " <td>8.6</td>\n", " <td>Saving Private Ryan</td>\n", " <td>R</td>\n", " <td>Action</td>\n", " <td>169</td>\n", " <td>[u'Tom Hanks', u'Matt Damon', u'Tom Sizemore']</td>\n", " </tr>\n", " <tr>\n", " <th>37</th>\n", " <td>8.6</td>\n", " <td>Raiders of the Lost Ark</td>\n", " <td>PG</td>\n", " <td>Action</td>\n", " <td>115</td>\n", " <td>[u'Harrison Ford', u'Karen Allen', u'Paul Free...</td>\n", " </tr>\n", " <tr>\n", " <th>38</th>\n", " <td>8.6</td>\n", " <td>Rear Window</td>\n", " <td>APPROVED</td>\n", " <td>Mystery</td>\n", " <td>112</td>\n", " <td>[u'James Stewart', u'Grace Kelly', u'Wendell C...</td>\n", " </tr>\n", " <tr>\n", " <th>39</th>\n", " <td>8.6</td>\n", " <td>Psycho</td>\n", " <td>R</td>\n", " <td>Horror</td>\n", " <td>109</td>\n", " <td>[u'Anthony Perkins', u'Janet Leigh', u'Vera Mi...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "35 8.6 Modern Times G Comedy 87 \n", "36 8.6 Saving Private Ryan R Action 169 \n", "37 8.6 Raiders of the Lost Ark PG Action 115 \n", "38 8.6 Rear Window APPROVED Mystery 112 \n", "39 8.6 Psycho R Horror 109 \n", "\n", " actors_list \n", "35 [u'Charles Chaplin', u'Paulette Goddard', u'He... \n", "36 [u'Tom Hanks', u'Matt Damon', u'Tom Sizemore'] \n", "37 [u'Harrison Ford', u'Karen Allen', u'Paul Free... \n", "38 [u'James Stewart', u'Grace Kelly', u'Wendell C... \n", "39 [u'Anthony Perkins', u'Janet Leigh', u'Vera Mi... " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we need movies above 8.5\n", "df_rating_bools = df['star_rating'].map(lambda row: row>8.5)\n", "df[df_rating_bools].tail(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Experiment 2" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>35</th>\n", " <td>8.6</td>\n", " <td>Modern Times</td>\n", " <td>G</td>\n", " <td>Comedy</td>\n", " <td>87</td>\n", " <td>[u'Charles Chaplin', u'Paulette Goddard', u'He...</td>\n", " </tr>\n", " <tr>\n", " <th>36</th>\n", " <td>8.6</td>\n", " <td>Saving Private Ryan</td>\n", " <td>R</td>\n", " <td>Action</td>\n", " <td>169</td>\n", " <td>[u'Tom Hanks', u'Matt Damon', u'Tom Sizemore']</td>\n", " </tr>\n", " <tr>\n", " <th>37</th>\n", " <td>8.6</td>\n", " <td>Raiders of the Lost Ark</td>\n", " <td>PG</td>\n", " <td>Action</td>\n", " <td>115</td>\n", " <td>[u'Harrison Ford', u'Karen Allen', u'Paul Free...</td>\n", " </tr>\n", " <tr>\n", " <th>38</th>\n", " <td>8.6</td>\n", " <td>Rear Window</td>\n", " <td>APPROVED</td>\n", " <td>Mystery</td>\n", " <td>112</td>\n", " <td>[u'James Stewart', u'Grace Kelly', u'Wendell C...</td>\n", " </tr>\n", " <tr>\n", " <th>39</th>\n", " <td>8.6</td>\n", " <td>Psycho</td>\n", " <td>R</td>\n", " <td>Horror</td>\n", " <td>109</td>\n", " <td>[u'Anthony Perkins', u'Janet Leigh', u'Vera Mi...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "35 8.6 Modern Times G Comedy 87 \n", "36 8.6 Saving Private Ryan R Action 169 \n", "37 8.6 Raiders of the Lost Ark PG Action 115 \n", "38 8.6 Rear Window APPROVED Mystery 112 \n", "39 8.6 Psycho R Horror 109 \n", "\n", " actors_list \n", "35 [u'Charles Chaplin', u'Paulette Goddard', u'He... \n", "36 [u'Tom Hanks', u'Matt Damon', u'Tom Sizemore'] \n", "37 [u'Harrison Ford', u'Karen Allen', u'Paul Free... \n", "38 [u'James Stewart', u'Grace Kelly', u'Wendell C... \n", "39 [u'Anthony Perkins', u'Janet Leigh', u'Vera Mi... " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we need movies above 8.5\n", "boolean = list()\n", "for row in df['star_rating']:\n", " if row > 8.5: boolean.append(True)\n", " else: boolean.append(False)\n", "\n", "# boolean is a list, and since column in pandas is a series, so we need to convert list to series\n", "df_rating_bools = pd.Series(boolean)\n", "df[df_rating_bools].tail(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Experiment 3" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>35</th>\n", " <td>8.6</td>\n", " <td>Modern Times</td>\n", " <td>G</td>\n", " <td>Comedy</td>\n", " <td>87</td>\n", " <td>[u'Charles Chaplin', u'Paulette Goddard', u'He...</td>\n", " </tr>\n", " <tr>\n", " <th>36</th>\n", " <td>8.6</td>\n", " <td>Saving Private Ryan</td>\n", " <td>R</td>\n", " <td>Action</td>\n", " <td>169</td>\n", " <td>[u'Tom Hanks', u'Matt Damon', u'Tom Sizemore']</td>\n", " </tr>\n", " <tr>\n", " <th>37</th>\n", " <td>8.6</td>\n", " <td>Raiders of the Lost Ark</td>\n", " <td>PG</td>\n", " <td>Action</td>\n", " <td>115</td>\n", " <td>[u'Harrison Ford', u'Karen Allen', u'Paul Free...</td>\n", " </tr>\n", " <tr>\n", " <th>38</th>\n", " <td>8.6</td>\n", " <td>Rear Window</td>\n", " <td>APPROVED</td>\n", " <td>Mystery</td>\n", " <td>112</td>\n", " <td>[u'James Stewart', u'Grace Kelly', u'Wendell C...</td>\n", " </tr>\n", " <tr>\n", " <th>39</th>\n", " <td>8.6</td>\n", " <td>Psycho</td>\n", " <td>R</td>\n", " <td>Horror</td>\n", " <td>109</td>\n", " <td>[u'Anthony Perkins', u'Janet Leigh', u'Vera Mi...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "35 8.6 Modern Times G Comedy 87 \n", "36 8.6 Saving Private Ryan R Action 169 \n", "37 8.6 Raiders of the Lost Ark PG Action 115 \n", "38 8.6 Rear Window APPROVED Mystery 112 \n", "39 8.6 Psycho R Horror 109 \n", "\n", " actors_list \n", "35 [u'Charles Chaplin', u'Paulette Goddard', u'He... \n", "36 [u'Tom Hanks', u'Matt Damon', u'Tom Sizemore'] \n", "37 [u'Harrison Ford', u'Karen Allen', u'Paul Free... \n", "38 [u'James Stewart', u'Grace Kelly', u'Wendell C... \n", "39 [u'Anthony Perkins', u'Janet Leigh', u'Vera Mi... " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# df['star_rating'] > 8.5 automatically searches/iterates through all the rows satisying this condition\n", "df[df['star_rating'] > 8.5].tail(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Takeaways\n", "\n", "1. Try practicing, Experiment 3 while coding." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# -----------------------\n", "\n", "# Multiple Filter (Vid-9)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>9.3</td>\n", " <td>The Shawshank Redemption</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>142</td>\n", " <td>[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>9.2</td>\n", " <td>The Godfather</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>175</td>\n", " <td>[u'Marlon Brando', u'Al Pacino', u'James Caan']</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>9.1</td>\n", " <td>The Godfather: Part II</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>200</td>\n", " <td>[u'Al Pacino', u'Robert De Niro', u'Robert Duv...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>9.0</td>\n", " <td>The Dark Knight</td>\n", " <td>PG-13</td>\n", " <td>Action</td>\n", " <td>152</td>\n", " <td>[u'Christian Bale', u'Heath Ledger', u'Aaron E...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>8.9</td>\n", " <td>Pulp Fiction</td>\n", " <td>R</td>\n", " <td>Crime</td>\n", " <td>154</td>\n", " <td>[u'John Travolta', u'Uma Thurman', u'Samuel L....</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating genre duration \\\n", "0 9.3 The Shawshank Redemption R Crime 142 \n", "1 9.2 The Godfather R Crime 175 \n", "2 9.1 The Godfather: Part II R Crime 200 \n", "3 9.0 The Dark Knight PG-13 Action 152 \n", "4 8.9 Pulp Fiction R Crime 154 \n", "\n", " actors_list \n", "0 [u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt... \n", "1 [u'Marlon Brando', u'Al Pacino', u'James Caan'] \n", "2 [u'Al Pacino', u'Robert De Niro', u'Robert Duv... \n", "3 [u'Christian Bale', u'Heath Ledger', u'Aaron E... \n", "4 [u'John Travolta', u'Uma Thurman', u'Samuel L.... " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_table(\n", " 'http://bit.ly/imdbratings', \n", " sep=','\n", " )\n", "df.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Experiment 1" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>7</th>\n", " <td>8.9</td>\n", " <td>The Lord of the Rings: The Return of the King</td>\n", " <td>PG-13</td>\n", " <td>Adventure</td>\n", " <td>201</td>\n", " <td>[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK...</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>8.7</td>\n", " <td>Seven Samurai</td>\n", " <td>UNRATED</td>\n", " <td>Drama</td>\n", " <td>207</td>\n", " <td>[u'Toshir\\xf4 Mifune', u'Takashi Shimura', u'K...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating \\\n", "7 8.9 The Lord of the Rings: The Return of the King PG-13 \n", "17 8.7 Seven Samurai UNRATED \n", "\n", " genre duration actors_list \n", "7 Adventure 201 [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... \n", "17 Drama 207 [u'Toshir\\xf4 Mifune', u'Takashi Shimura', u'K... " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we want movies that have rating above 8.5 and duration above 200mins\n", "df[(df['star_rating'] > 8.5) & (df['duration'] > 200)].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Experiment 2" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>7</th>\n", " <td>8.9</td>\n", " <td>The Lord of the Rings: The Return of the King</td>\n", " <td>PG-13</td>\n", " <td>Adventure</td>\n", " <td>201</td>\n", " <td>[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK...</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>8.7</td>\n", " <td>Seven Samurai</td>\n", " <td>UNRATED</td>\n", " <td>Drama</td>\n", " <td>207</td>\n", " <td>[u'Toshir\\xf4 Mifune', u'Takashi Shimura', u'K...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title content_rating \\\n", "7 8.9 The Lord of the Rings: The Return of the King PG-13 \n", "17 8.7 Seven Samurai UNRATED \n", "\n", " genre duration actors_list \n", "7 Adventure 201 [u'Elijah Wood', u'Viggo Mortensen', u'Ian McK... \n", "17 Drama 207 [u'Toshir\\xf4 Mifune', u'Takashi Shimura', u'K... " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_rating_bools = df['star_rating'].map(lambda row: row>8.5)\n", "df_duration_bools = df['duration'].map(lambda row: row>200)\n", "df[df_rating_bools & df_duration_bools].tail(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Experiment 3" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style>\n", " .dataframe thead tr:only-child th {\n", " text-align: right;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: left;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>star_rating</th>\n", " <th>title</th>\n", " <th>content_rating</th>\n", " <th>genre</th>\n", " <th>duration</th>\n", " <th>actors_list</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>3</th>\n", " <td>9.0</td>\n", " <td>The Dark Knight</td>\n", " <td>PG-13</td>\n", " <td>Action</td>\n", " <td>152</td>\n", " <td>[u'Christian Bale', u'Heath Ledger', u'Aaron E...</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>8.9</td>\n", " <td>12 Angry Men</td>\n", " <td>NOT RATED</td>\n", " <td>Drama</td>\n", " <td>96</td>\n", " <td>[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals...</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>8.9</td>\n", " <td>Fight Club</td>\n", " <td>R</td>\n", " <td>Drama</td>\n", " <td>139</td>\n", " <td>[u'Brad Pitt', u'Edward Norton', u'Helena Bonh...</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>8.8</td>\n", " <td>Inception</td>\n", " <td>PG-13</td>\n", " <td>Action</td>\n", " <td>148</td>\n", " <td>[u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'...</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>8.8</td>\n", " <td>Star Wars: Episode V - The Empire Strikes Back</td>\n", " <td>PG</td>\n", " <td>Action</td>\n", " <td>124</td>\n", " <td>[u'Mark Hamill', u'Harrison Ford', u'Carrie Fi...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " star_rating title \\\n", "3 9.0 The Dark Knight \n", "5 8.9 12 Angry Men \n", "9 8.9 Fight Club \n", "11 8.8 Inception \n", "12 8.8 Star Wars: Episode V - The Empire Strikes Back \n", "\n", " content_rating genre duration \\\n", "3 PG-13 Action 152 \n", "5 NOT RATED Drama 96 \n", "9 R Drama 139 \n", "11 PG-13 Action 148 \n", "12 PG Action 124 \n", "\n", " actors_list \n", "3 [u'Christian Bale', u'Heath Ledger', u'Aaron E... \n", "5 [u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals... \n", "9 [u'Brad Pitt', u'Edward Norton', u'Helena Bonh... \n", "11 [u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'... \n", "12 [u'Mark Hamill', u'Harrison Ford', u'Carrie Fi... " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# this appoarch is inspired by python 'if in [1,2]' functionality\n", "bools = df['genre'].isin(['Drama', 'Action'])\n", "df[bools].head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Takeaways\n", "\n", "1. Use & when putting and filter\n", "2. Use | when putting or filter\n", "3. Remember to put parenthesis as shown in [Vid-9 Experiment 1], it helps pandas to set priority to the evaluations\n", "4. When in situation to use multiple | conditions, try using Vid-9 Experiement 3" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.12" } }, "nbformat": 4, "nbformat_minor": 1 }