{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Data Transformation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1. Removing Duplicates" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
k1k2
0one1
1one1
2one2
3two3
4two3
5two4
6two4
\n", "
" ], "text/plain": [ " k1 k2\n", "0 one 1\n", "1 one 1\n", "2 one 2\n", "3 two 3\n", "4 two 3\n", "5 two 4\n", "6 two 4" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 4,\n", " 'k2': [1, 1, 2, 3, 3, 4, 4]})\n", "data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 True\n", "2 False\n", "3 False\n", "4 True\n", "5 False\n", "6 True\n", "dtype: bool" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The DataFrame method duplicated returns a boolean Series indicating whether each\n", "# row is a duplicate or not\n", "\n", "data.duplicated()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
k1k2
0one1
2one2
3two3
5two4
\n", "
" ], "text/plain": [ " k1 k2\n", "0 one 1\n", "2 one 2\n", "3 two 3\n", "5 two 4" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# drop_duplicates drop the duplicates\n", "data.drop_duplicates()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
k1k2
0one1
3two3
\n", "
" ], "text/plain": [ " k1 k2\n", "0 one 1\n", "3 two 3" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Both of these methods by default consider all of the columns; \n", "# until we defined a specific column name\n", "data.drop_duplicates(['k1'])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
k1k2
0one1
2one2
3two3
5two4
\n", "
" ], "text/plain": [ " k1 k2\n", "0 one 1\n", "2 one 2\n", "3 two 3\n", "5 two 4" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.drop_duplicates(['k2'])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\tools\\Anaconda3\\lib\\site-packages\\ipykernel\\__main__.py:4: FutureWarning: the take_last=True keyword is deprecated, use keep='last' instead\n" ] }, { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
k1k2
1one1
2one2
4two3
6two4
\n", "
" ], "text/plain": [ " k1 k2\n", "1 one 1\n", "2 one 2\n", "4 two 3\n", "6 two 4" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# duplicated and drop_duplicates by default keep the first observed value combination.\n", "# Passing take_last=True will return the last one:\n", "\n", "data.drop_duplicates(['k1', 'k2'], take_last=True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
k1k2
1one1
2one2
4two3
6two4
\n", "
" ], "text/plain": [ " k1 k2\n", "1 one 1\n", "2 one 2\n", "4 two 3\n", "6 two 4" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.drop_duplicates(['k1', 'k2'], keep='last')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2. Transforming Data Using a Function or Mapping" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
foodounces
0bacon4.0
1pulled pork3.0
2bacon12.0
3Pastrami6.0
4corned beef7.5
5Bacon8.0
6pastrami3.0
7honey ham5.0
8nova lox6.0
\n", "
" ], "text/plain": [ " food ounces\n", "0 bacon 4.0\n", "1 pulled pork 3.0\n", "2 bacon 12.0\n", "3 Pastrami 6.0\n", "4 corned beef 7.5\n", "5 Bacon 8.0\n", "6 pastrami 3.0\n", "7 honey ham 5.0\n", "8 nova lox 6.0" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',\n", " 'corned beef', 'Bacon', 'pastrami', 'honey ham',\n", " 'nova lox'],\n", " 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})\n", "data" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "{'bacon': 'pig',\n", " 'corned beef': 'cow',\n", " 'honey ham': 'pig',\n", " 'nova lox': 'salmon',\n", " 'pastrami': 'cow',\n", " 'pulled pork': 'pig'}" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "meat_to_animal = {\n", " 'bacon': 'pig',\n", " 'pulled pork': 'pig',\n", " 'pastrami': 'cow',\n", " 'corned beef': 'cow',\n", " 'honey ham': 'pig',\n", " 'nova lox': 'salmon'\n", "}\n", "meat_to_animal" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
foodouncesanimal
0bacon4.0pig
1pulled pork3.0pig
2bacon12.0pig
3Pastrami6.0cow
4corned beef7.5cow
5Bacon8.0pig
6pastrami3.0cow
7honey ham5.0pig
8nova lox6.0salmon
\n", "
" ], "text/plain": [ " food ounces animal\n", "0 bacon 4.0 pig\n", "1 pulled pork 3.0 pig\n", "2 bacon 12.0 pig\n", "3 Pastrami 6.0 cow\n", "4 corned beef 7.5 cow\n", "5 Bacon 8.0 pig\n", "6 pastrami 3.0 cow\n", "7 honey ham 5.0 pig\n", "8 nova lox 6.0 salmon" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data['animal'] = data['food'].map(str.lower).map(meat_to_animal)\n", "data" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 pig\n", "1 pig\n", "2 pig\n", "3 cow\n", "4 cow\n", "5 pig\n", "6 cow\n", "7 pig\n", "8 salmon\n", "Name: food, dtype: object" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data['food'].map(lambda x: meat_to_animal[x.lower()])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Replacing Values" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "1 -999.0\n", "2 2.0\n", "3 -999.0\n", "4 -1000.0\n", "5 3.0\n", "dtype: float64" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.Series([1., -999., 2., -999., -1000., 3.])\n", "data" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "1 NaN\n", "2 2.0\n", "3 NaN\n", "4 -1000.0\n", "5 3.0\n", "dtype: float64" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The -999 values might be sentinel values for missing data. 
To replace these with NA\n", "# values that pandas understands, we can use replace\n", "\n", "data.replace(-999, np.nan)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "1 NaN\n", "2 2.0\n", "3 NaN\n", "4 NaN\n", "5 3.0\n", "dtype: float64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# replace multiple values at once\n", "\n", "data.replace([-999,-1000], np.nan)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "1 NaN\n", "2 2.0\n", "3 NaN\n", "4 0.0\n", "5 3.0\n", "dtype: float64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# To use a different replacement for each value, pass a list of substitutes:\n", "data.replace([-999,-1000], [np.nan, 0])" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "1 NaN\n", "2 2.0\n", "3 NaN\n", "4 0.0\n", "5 3.0\n", "dtype: float64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The argument passed can also be a dict:\n", "data.replace({-999: np.nan, -1000: 0})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Renaming Axis Indexes" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
Ohio0123
Colorado4567
New York891011
\n", "
" ], "text/plain": [ " one two three four\n", "Ohio 0 1 2 3\n", "Colorado 4 5 6 7\n", "New York 8 9 10 11" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame(np.arange(12).reshape((3, 4)),\n", " index=['Ohio', 'Colorado', 'New York'],\n", " columns=['one', 'two', 'three', 'four'])\n", "data" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array(['OHIO', 'COLORADO', 'NEW YORK'], dtype=object)" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.index.map(str.upper)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
OHIO0123
COLORADO4567
NEW YORK891011
\n", "
" ], "text/plain": [ " one two three four\n", "OHIO 0 1 2 3\n", "COLORADO 4 5 6 7\n", "NEW YORK 8 9 10 11" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.index = data.index.map(str.upper)\n", "data" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ONETWOTHREEFOUR
Ohio0123
Colorado4567
New York891011
\n", "
" ], "text/plain": [ " ONE TWO THREE FOUR\n", "Ohio 0 1 2 3\n", "Colorado 4 5 6 7\n", "New York 8 9 10 11" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# o create a transformed version of a data set without modifying the original,\n", "# a useful method is rename:\n", "data.rename(index=str.title, columns=str.upper)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwopeekaboofour
INDIANA0123
COLORADO4567
NEW YORK891011
\n", "
" ], "text/plain": [ " one two peekaboo four\n", "INDIANA 0 1 2 3\n", "COLORADO 4 5 6 7\n", "NEW YORK 8 9 10 11" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# rename can be used in conjunction with a dict-like object providing new values\n", "# for a subset of the axis labels\n", "\n", "data.rename(index={'OHIO': 'INDIANA'},\n", " columns={'three': 'peekaboo'})" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
INDIANA0123
COLORADO4567
NEW YORK891011
\n", "
" ], "text/plain": [ " one two three four\n", "INDIANA 0 1 2 3\n", "COLORADO 4 5 6 7\n", "NEW YORK 8 9 10 11" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# rename saves having to copy the DataFrame manually and assign to its index and columns \n", "# attributes. Should you wish to modify a data set in place, pass inplace=True\n", "\n", "_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)\n", "data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Discretization and Binning" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Continuous data is often discretized or otherwised separated into “bins” for analysis.\n", "\n", "ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]\n", "ages" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[18, 25, 35, 60, 100]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bins = [18, 25, 35, 60, 100]\n", "bins" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]\n", "Length: 12\n", "Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cats = pd.cut(ages, bins)\n", "cats" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\tools\\Anaconda3\\lib\\site-packages\\ipykernel\\__main__.py:1: FutureWarning: 'labels' is deprecated. 
Use 'codes' instead\n", " if __name__ == '__main__':\n" ] }, { "data": { "text/plain": [ "array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cats.labels" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cats.codes" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[18, 18, 18, 25, 18, 18, 35, 25, 60, 35, 35, 25]" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[bins[i] for i in cats.codes]" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\tools\\Anaconda3\\lib\\site-packages\\ipykernel\\__main__.py:1: FutureWarning: Accessing 'levels' is deprecated, use 'categories'\n", " if __name__ == '__main__':\n" ] }, { "data": { "text/plain": [ "Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object')" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cats.levels" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object')" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cats.categories" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(18, 25] 5\n", "(35, 60] 3\n", "(25, 35] 3\n", "(60, 100] 1\n", "dtype: int64" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], 
"source": [ "pd.value_counts(cats)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]\n", "Length: 12\n", "Categories (4, object): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Consistent with mathematical notation for intervals, a parenthesis means that the side\n", "# is open while the square bracket means it is closed (inclusive). Which side is closed can\n", "# be changed by passing right=False:\n", "pd.cut(ages, [18, 26, 36, 61, 100], right=False)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]\n", "Length: 12\n", "Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# You can also pass your own bin names by passing a list or array to the labels option:\n", "group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']\n", "pd.cut(ages, bins, labels=group_names)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([ 0.30040941, 0.42335852, 0.73095696, 0.74586086, 0.99197458,\n", " 0.10334217, 0.53164815, 0.63212415, 0.29426088, 0.6753901 ,\n", " 0.32015222, 0.26670971, 0.32528884, 0.51031056, 0.33031443,\n", " 0.59880679, 0.69966579, 0.32159514, 0.71322471, 0.82226222])" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# If you pass cut a integer number of bins instead of explicit bin edges, it will compute\n", "# equal-length bins based on the minimum and maximum values 
in the data. \n", "data = np.random.rand(20)\n", "data" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(0.1, 0.33], (0.33, 0.55], (0.55, 0.77], (0.55, 0.77], (0.77, 0.99], ..., (0.55, 0.77], (0.55, 0.77], (0.1, 0.33], (0.55, 0.77], (0.77, 0.99]]\n", "Length: 20\n", "Categories (4, object): [(0.1, 0.33] < (0.33, 0.55] < (0.55, 0.77] < (0.77, 0.99]]" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat=pd.cut(data, 4, precision=2)\n", "cat" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "Index(['(0.1, 0.33]', '(0.33, 0.55]', '(0.55, 0.77]', '(0.77, 0.99]'], dtype='object')" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat.categories" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([0, 1, 2, 2, 3, 0, 1, 2, 0, 2, 0, 0, 0, 1, 1, 2, 2, 0, 2, 3], dtype=int8)" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat.codes" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(0.55, 0.77] 7\n", "(0.1, 0.33] 7\n", "(0.33, 0.55] 4\n", "(0.77, 0.99] 2\n", "dtype: int64" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.value_counts(cat)" ] }, { "cell_type": "code", "execution_count": 62, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(0.23, 0.36], (0.36, 0.48], (0.61, 0.74], (0.74, 0.87], (0.87, 0.99], ..., (0.48, 0.61], (0.61, 0.74], (0.23, 0.36], (0.61, 0.74], (0.74, 0.87]]\n", "Length: 20\n", "Categories (7, object): [(0.1, 0.23] < (0.23, 0.36] < (0.36, 0.48] < (0.48, 0.61] < (0.61, 0.74] < (0.74, 0.87] < (0.87, 0.99]]" ] }, "execution_count": 
62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cat=pd.cut(data, 7, precision=2)\n", "cat" ] }, { "cell_type": "code", "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(0.23, 0.36] 7\n", "(0.61, 0.74] 5\n", "(0.48, 0.61] 3\n", "(0.74, 0.87] 2\n", "(0.87, 0.99] 1\n", "(0.36, 0.48] 1\n", "(0.1, 0.23] 1\n", "dtype: int64" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.value_counts(cat)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(-0.605, 0.0514], (-0.605, 0.0514], [-3.296, -0.605], [-3.296, -0.605], [-3.296, -0.605], ..., (0.0514, 0.735], (0.735, 3.246], (0.0514, 0.735], (-0.605, 0.0514], (0.735, 3.246]]\n", "Length: 1000\n", "Categories (4, object): [[-3.296, -0.605] < (-0.605, 0.0514] < (0.0514, 0.735] < (0.735, 3.246]]" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# A closely related function, qcut, bins the data based on sample quantiles.\n", "data = np.random.randn(1000)\n", "\n", "cats = pd.qcut(data, 4) # Cut into quartiles\n", "cats" ] }, { "cell_type": "code", "execution_count": 66, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(0.735, 3.246] 250\n", "(0.0514, 0.735] 250\n", "(-0.605, 0.0514] 250\n", "[-3.296, -0.605] 250\n", "dtype: int64" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.value_counts(cats)" ] }, { "cell_type": "code", "execution_count": 68, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "[(-1.251, 0.0514], (-1.251, 0.0514], [-3.296, -1.251], [-3.296, -1.251], [-3.296, -1.251], ..., (0.0514, 1.281], (1.281, 3.246], (0.0514, 1.281], (-1.251, 0.0514], (1.281, 3.246]]\n", "Length: 1000\n", "Categories (4, object): [[-3.296, -1.251] < (-1.251, 0.0514] < (0.0514, 1.281] < (1.281, 
3.246]]" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Similar to cut you can pass your own quantiles (numbers between 0 and 1, inclusive):\n", "cat=pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])\n", "cat" ] }, { "cell_type": "code", "execution_count": 69, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(0.0514, 1.281] 400\n", "(-1.251, 0.0514] 400\n", "(1.281, 3.246] 100\n", "[-3.296, -1.251] 100\n", "dtype: int64" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.value_counts(cat)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Detecting and Filtering Outliers" ] }, { "cell_type": "code", "execution_count": 71, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
count1000.0000001000.0000001000.0000001000.000000
mean-0.0676840.0679240.025598-0.002298
std0.9980350.9921061.0068350.996794
min-3.428254-3.548824-3.184377-3.745356
25%-0.774890-0.591841-0.641675-0.644144
50%-0.1164010.1011430.002073-0.013611
75%0.6163660.7802820.6803910.654328
max3.3666262.6536563.2603833.927528
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "count 1000.000000 1000.000000 1000.000000 1000.000000\n", "mean -0.067684 0.067924 0.025598 -0.002298\n", "std 0.998035 0.992106 1.006835 0.996794\n", "min -3.428254 -3.548824 -3.184377 -3.745356\n", "25% -0.774890 -0.591841 -0.641675 -0.644144\n", "50% -0.116401 0.101143 0.002073 -0.013611\n", "75% 0.616366 0.780282 0.680391 0.654328\n", "max 3.366626 2.653656 3.260383 3.927528" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Filtering or transforming outliers is largely a matter of applying array operations.\n", "\n", "np.random.seed(12345)\n", "\n", "data = pd.DataFrame(np.random.randn(1000, 4))\n", "data.describe()" ] }, { "cell_type": "code", "execution_count": 72, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "97 3.927528\n", "305 -3.399312\n", "400 -3.745356\n", "Name: 3, dtype: float64" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Suppose you wanted to find values in one of the columns exceeding three in magnitude:\n", "col = data[3]\n", "\n", "col[np.abs(col) > 3]" ] }, { "cell_type": "code", "execution_count": 73, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
5-0.5397410.4769853.248944-1.021228
97-0.7743630.5529360.1060613.927528
102-0.655054-0.5652303.1768730.959533
305-2.3155550.457246-0.025907-3.399312
3240.0501881.9513123.2603830.963301
4000.1463260.508391-0.196713-3.745356
499-0.293333-0.242459-3.0569901.918403
523-3.428254-0.296336-0.439938-0.867165
5860.2751441.179227-3.1843771.369891
808-0.362528-3.5488241.553205-2.186301
9003.366626-2.3722140.8510101.332846
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "5 -0.539741 0.476985 3.248944 -1.021228\n", "97 -0.774363 0.552936 0.106061 3.927528\n", "102 -0.655054 -0.565230 3.176873 0.959533\n", "305 -2.315555 0.457246 -0.025907 -3.399312\n", "324 0.050188 1.951312 3.260383 0.963301\n", "400 0.146326 0.508391 -0.196713 -3.745356\n", "499 -0.293333 -0.242459 -3.056990 1.918403\n", "523 -3.428254 -0.296336 -0.439938 -0.867165\n", "586 0.275144 1.179227 -3.184377 1.369891\n", "808 -0.362528 -3.548824 1.553205 -2.186301\n", "900 3.366626 -2.372214 0.851010 1.332846" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# To select all rows having a value exceeding 3 or -3, you can use the any method on a\n", "# boolean DataFrame:\n", "\n", " data[(np.abs(data) > 3).any(1)]" ] }, { "cell_type": "code", "execution_count": 77, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
count1000.0000001000.0000001000.0000001000.000000
mean-0.0676230.0684730.025153-0.002081
std0.9954850.9902531.0039770.989736
min-3.000000-3.000000-3.000000-3.000000
25%-0.774890-0.591841-0.641675-0.644144
50%-0.1164010.1011430.002073-0.013611
75%0.6163660.7802820.6803910.654328
max3.0000002.6536563.0000003.000000
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "count 1000.000000 1000.000000 1000.000000 1000.000000\n", "mean -0.067623 0.068473 0.025153 -0.002081\n", "std 0.995485 0.990253 1.003977 0.989736\n", "min -3.000000 -3.000000 -3.000000 -3.000000\n", "25% -0.774890 -0.591841 -0.641675 -0.644144\n", "50% -0.116401 0.101143 0.002073 -0.013611\n", "75% 0.616366 0.780282 0.680391 0.654328\n", "max 3.000000 2.653656 3.000000 3.000000" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# code to cap values outside the interval -3 to 3\n", "data[np.abs(data) > 3] = np.sign(data) * 3\n", "\n", "data.describe()" ] }, { "cell_type": "code", "execution_count": 78, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# data[np.abs(data) > 3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Permutation and Random Sampling" ] }, { "cell_type": "code", "execution_count": 79, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
00123
14567
2891011
312131415
416171819
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "0 0 1 2 3\n", "1 4 5 6 7\n", "2 8 9 10 11\n", "3 12 13 14 15\n", "4 16 17 18 19" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Permuting (randomly reordering) a Series or the rows in a DataFrame is easy to do using\n", "# the numpy.random.permutation function. Calling permutation with the length of the axis\n", "# you want to permute produces an array of integers indicating the new ordering:\n", "\n", "df = pd.DataFrame(np.arange(5 * 4).reshape(5, 4))\n", "df" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([1, 3, 4, 0, 2])" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sampler = np.random.permutation(5)\n", "sampler" ] }, { "cell_type": "code", "execution_count": 83, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
14567
312131415
416171819
00123
2891011
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "1 4 5 6 7\n", "3 12 13 14 15\n", "4 16 17 18 19\n", "0 0 1 2 3\n", "2 8 9 10 11" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.take(sampler)" ] }, { "cell_type": "code", "execution_count": 84, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
14567
312131415
00123
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "1 4 5 6 7\n", "3 12 13 14 15\n", "0 0 1 2 3" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# To select a random subset without replacement, one way is to slice off the first k ele-\n", "# ments of the array returned by permutation, where k is the desired subset size. There\n", "# are much more efficient sampling-without-replacement algorithms, but this is an easy\n", "# strategy that uses readily available tools\n", "\n", "df.take(np.random.permutation(len(df))[:3])" ] }, { "cell_type": "code", "execution_count": 85, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([2, 2, 0, 3, 0, 4, 1, 1, 2, 3])" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# To generate a sample with replacement, the fastest way is to use np.random.randint to\n", "# draw random integers:\n", "bag = np.array([5, 7, -1, 6, 4])\n", "\n", "sampler = np.random.randint(0, len(bag), size=10)\n", "sampler" ] }, { "cell_type": "code", "execution_count": 86, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "array([-1, -1, 5, 6, 5, 4, 7, 7, -1, 6])" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "draws = bag.take(sampler)\n", "draws" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Computing Indicator/Dummy Variables" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another type of transformation for statistical modeling or machine learning applica-\n", "tions is converting a categorical variable into a “dummy” or “indicator” matrix. If a\n", "column in a DataFrame has k distinct values, you would derive a matrix or DataFrame\n", "containing k columns containing all 1’s and 0’s. pandas has a get_dummies function for\n", "doing this, though devising one yourself is not difficult. 
Let’s return to an earlier ex-\n", "ample DataFrame" ] }, { "cell_type": "code", "execution_count": 87, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1key
00b
11b
22a
33c
44a
55b
\n", "
" ], "text/plain": [ " data1 key\n", "0 0 b\n", "1 1 b\n", "2 2 a\n", "3 3 c\n", "4 4 a\n", "5 5 b" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],\n", " 'data1': range(6)})\n", "df" ] }, { "cell_type": "code", "execution_count": 88, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abc
00.01.00.0
10.01.00.0
21.00.00.0
30.00.01.0
41.00.00.0
50.01.00.0
\n", "
" ], "text/plain": [ " a b c\n", "0 0.0 1.0 0.0\n", "1 0.0 1.0 0.0\n", "2 1.0 0.0 0.0\n", "3 0.0 0.0 1.0\n", "4 1.0 0.0 0.0\n", "5 0.0 1.0 0.0" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.get_dummies(df['key'])" ] }, { "cell_type": "code", "execution_count": 89, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
key_akey_bkey_c
00.01.00.0
10.01.00.0
21.00.00.0
30.00.01.0
41.00.00.0
50.01.00.0
\n", "
" ], "text/plain": [ " key_a key_b key_c\n", "0 0.0 1.0 0.0\n", "1 0.0 1.0 0.0\n", "2 1.0 0.0 0.0\n", "3 0.0 0.0 1.0\n", "4 1.0 0.0 0.0\n", "5 0.0 1.0 0.0" ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dummies = pd.get_dummies(df['key'], prefix='key')\n", "dummies" ] }, { "cell_type": "code", "execution_count": 91, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
data1key_akey_bkey_c
000.01.00.0
110.01.00.0
221.00.00.0
330.00.01.0
441.00.00.0
550.01.00.0
\n", "
" ], "text/plain": [ " data1 key_a key_b key_c\n", "0 0 0.0 1.0 0.0\n", "1 1 0.0 1.0 0.0\n", "2 2 1.0 0.0 0.0\n", "3 3 0.0 0.0 1.0\n", "4 4 1.0 0.0 0.0\n", "5 5 0.0 1.0 0.0" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_with_dummy = df[['data1']].join(dummies)\n", "df_with_dummy" ] }, { "cell_type": "code", "execution_count": 93, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdtitlegenres
01Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy
12Jumanji (1995)Adventure|Children|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama|Romance
45Father of the Bride Part II (1995)Comedy
\n", "
" ], "text/plain": [ " movieId title \\\n", "0 1 Toy Story (1995) \n", "1 2 Jumanji (1995) \n", "2 3 Grumpier Old Men (1995) \n", "3 4 Waiting to Exhale (1995) \n", "4 5 Father of the Bride Part II (1995) \n", "\n", " genres \n", "0 Adventure|Animation|Children|Comedy|Fantasy \n", "1 Adventure|Children|Fantasy \n", "2 Comedy|Romance \n", "3 Comedy|Drama|Romance \n", "4 Comedy " ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# reading the movie data\n", "movies = pd.read_csv(\"dataset/movies.csv\")\n", "movies.head()" ] }, { "cell_type": "code", "execution_count": 105, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Adding indicator variables for each genre requires a little bit of wrangling. First, we\n", "# extract the list of unique genres in the dataset (using a nice set.union trick)\n", "\n", "genre_iter = (set(x.split('|')) for x in movies.genres)\n", "genres = sorted(set.union(*genre_iter))\n", "\n", "# for i in genre_iter:\n", "# print(i)" ] }, { "cell_type": "code", "execution_count": 106, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['(no genres listed)',\n", " 'Action',\n", " 'Adventure',\n", " 'Animation',\n", " 'Children',\n", " 'Comedy',\n", " 'Crime',\n", " 'Documentary',\n", " 'Drama',\n", " 'Fantasy',\n", " 'Film-Noir',\n", " 'Horror',\n", " 'IMAX',\n", " 'Musical',\n", " 'Mystery',\n", " 'Romance',\n", " 'Sci-Fi',\n", " 'Thriller',\n", " 'War',\n", " 'Western']" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "genres" ] }, { "cell_type": "code", "execution_count": 107, "metadata": { "collapsed": true }, "outputs": [], "source": [ "dummies = pd.DataFrame(np.zeros((len(movies), len(genres))), columns=genres)" ] }, { "cell_type": "code", "execution_count": 108, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# .loc replaces the removed .ix indexer; row label i equals its position here\n", "# because dummies was built above with a default RangeIndex\n", "for i, gen in enumerate(movies.genres):\n", " dummies.loc[i, gen.split('|')] = 1" ] }, {
"cell_type": "code", "execution_count": 109, "metadata": { "collapsed": true }, "outputs": [], "source": [ "movies_windic = movies.join(dummies.add_prefix('Genre_'))" ] }, { "cell_type": "code", "execution_count": 110, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "movieId 1\n", "title Toy Story (1995)\n", "genres Adventure|Animation|Children|Comedy|Fantasy\n", "Genre_(no genres listed) 0\n", "Genre_Action 0\n", "Genre_Adventure 1\n", "Genre_Animation 1\n", "Genre_Children 1\n", "Genre_Comedy 1\n", "Genre_Crime 0\n", "Genre_Documentary 0\n", "Genre_Drama 0\n", "Genre_Fantasy 1\n", "Genre_Film-Noir 0\n", "Genre_Horror 0\n", "Genre_IMAX 0\n", "Genre_Musical 0\n", "Genre_Mystery 0\n", "Genre_Romance 0\n", "Genre_Sci-Fi 0\n", "Genre_Thriller 0\n", "Genre_War 0\n", "Genre_Western 0\n", "Name: 0, dtype: object" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# .iloc replaces the removed .ix indexer for positional row access\n", "movies_windic.iloc[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [conda root]", "language": "python", "name": "conda-root-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }