{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Transformation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. Removing Duplicates"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" k1 | \n",
" k2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" one | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" one | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" one | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" two | \n",
" 3 | \n",
"
\n",
" \n",
" 4 | \n",
" two | \n",
" 3 | \n",
"
\n",
" \n",
" 5 | \n",
" two | \n",
" 4 | \n",
"
\n",
" \n",
" 6 | \n",
" two | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" k1 k2\n",
"0 one 1\n",
"1 one 1\n",
"2 one 2\n",
"3 two 3\n",
"4 two 3\n",
"5 two 4\n",
"6 two 4"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 4,\n",
" 'k2': [1, 1, 2, 3, 3, 4, 4]})\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 True\n",
"2 False\n",
"3 False\n",
"4 True\n",
"5 False\n",
"6 True\n",
"dtype: bool"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The DataFrame method duplicated returns a boolean Series indicating whether each\n",
"# row is a duplicate or not\n",
"\n",
"data.duplicated()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" k1 | \n",
" k2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" one | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" one | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" two | \n",
" 3 | \n",
"
\n",
" \n",
" 5 | \n",
" two | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" k1 k2\n",
"0 one 1\n",
"2 one 2\n",
"3 two 3\n",
"5 two 4"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# drop_duplicates drop the duplicates\n",
"data.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" k1 | \n",
" k2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" one | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" two | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" k1 k2\n",
"0 one 1\n",
"3 two 3"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Both of these methods by default consider all of the columns; \n",
"# until we defined a specific column name\n",
"data.drop_duplicates(['k1'])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" k1 | \n",
" k2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" one | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" one | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" two | \n",
" 3 | \n",
"
\n",
" \n",
" 5 | \n",
" two | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" k1 k2\n",
"0 one 1\n",
"2 one 2\n",
"3 two 3\n",
"5 two 4"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.drop_duplicates(['k2'])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\tools\\Anaconda3\\lib\\site-packages\\ipykernel\\__main__.py:4: FutureWarning: the take_last=True keyword is deprecated, use keep='last' instead\n"
]
},
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" k1 | \n",
" k2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" one | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" one | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" two | \n",
" 3 | \n",
"
\n",
" \n",
" 6 | \n",
" two | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" k1 k2\n",
"1 one 1\n",
"2 one 2\n",
"4 two 3\n",
"6 two 4"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# duplicated and drop_duplicates by default keep the first observed value combination.\n",
"# Passing take_last=True will return the last one:\n",
"\n",
"data.drop_duplicates(['k1', 'k2'], take_last=True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" k1 | \n",
" k2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" one | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" one | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" two | \n",
" 3 | \n",
"
\n",
" \n",
" 6 | \n",
" two | \n",
" 4 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" k1 k2\n",
"1 one 1\n",
"2 one 2\n",
"4 two 3\n",
"6 two 4"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.drop_duplicates(['k1', 'k2'], keep='last')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. Transforming Data Using a Function or Mapping"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" food | \n",
" ounces | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" bacon | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1 | \n",
" pulled pork | \n",
" 3.0 | \n",
"
\n",
" \n",
" 2 | \n",
" bacon | \n",
" 12.0 | \n",
"
\n",
" \n",
" 3 | \n",
" Pastrami | \n",
" 6.0 | \n",
"
\n",
" \n",
" 4 | \n",
" corned beef | \n",
" 7.5 | \n",
"
\n",
" \n",
" 5 | \n",
" Bacon | \n",
" 8.0 | \n",
"
\n",
" \n",
" 6 | \n",
" pastrami | \n",
" 3.0 | \n",
"
\n",
" \n",
" 7 | \n",
" honey ham | \n",
" 5.0 | \n",
"
\n",
" \n",
" 8 | \n",
" nova lox | \n",
" 6.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" food ounces\n",
"0 bacon 4.0\n",
"1 pulled pork 3.0\n",
"2 bacon 12.0\n",
"3 Pastrami 6.0\n",
"4 corned beef 7.5\n",
"5 Bacon 8.0\n",
"6 pastrami 3.0\n",
"7 honey ham 5.0\n",
"8 nova lox 6.0"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',\n",
" 'corned beef', 'Bacon', 'pastrami', 'honey ham',\n",
" 'nova lox'],\n",
" 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'bacon': 'pig',\n",
" 'corned beef': 'cow',\n",
" 'honey ham': 'pig',\n",
" 'nova lox': 'salmon',\n",
" 'pastrami': 'cow',\n",
" 'pulled pork': 'pig'}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"meat_to_animal = {\n",
" 'bacon': 'pig',\n",
" 'pulled pork': 'pig',\n",
" 'pastrami': 'cow',\n",
" 'corned beef': 'cow',\n",
" 'honey ham': 'pig',\n",
" 'nova lox': 'salmon'\n",
"}\n",
"meat_to_animal"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" food | \n",
" ounces | \n",
" animal | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" bacon | \n",
" 4.0 | \n",
" pig | \n",
"
\n",
" \n",
" 1 | \n",
" pulled pork | \n",
" 3.0 | \n",
" pig | \n",
"
\n",
" \n",
" 2 | \n",
" bacon | \n",
" 12.0 | \n",
" pig | \n",
"
\n",
" \n",
" 3 | \n",
" Pastrami | \n",
" 6.0 | \n",
" cow | \n",
"
\n",
" \n",
" 4 | \n",
" corned beef | \n",
" 7.5 | \n",
" cow | \n",
"
\n",
" \n",
" 5 | \n",
" Bacon | \n",
" 8.0 | \n",
" pig | \n",
"
\n",
" \n",
" 6 | \n",
" pastrami | \n",
" 3.0 | \n",
" cow | \n",
"
\n",
" \n",
" 7 | \n",
" honey ham | \n",
" 5.0 | \n",
" pig | \n",
"
\n",
" \n",
" 8 | \n",
" nova lox | \n",
" 6.0 | \n",
" salmon | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" food ounces animal\n",
"0 bacon 4.0 pig\n",
"1 pulled pork 3.0 pig\n",
"2 bacon 12.0 pig\n",
"3 Pastrami 6.0 cow\n",
"4 corned beef 7.5 cow\n",
"5 Bacon 8.0 pig\n",
"6 pastrami 3.0 cow\n",
"7 honey ham 5.0 pig\n",
"8 nova lox 6.0 salmon"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['animal'] = data['food'].map(str.lower).map(meat_to_animal)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 pig\n",
"1 pig\n",
"2 pig\n",
"3 cow\n",
"4 cow\n",
"5 pig\n",
"6 cow\n",
"7 pig\n",
"8 salmon\n",
"Name: food, dtype: object"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['food'].map(lambda x: meat_to_animal[x.lower()])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Replacing Values"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 1.0\n",
"1 -999.0\n",
"2 2.0\n",
"3 -999.0\n",
"4 -1000.0\n",
"5 3.0\n",
"dtype: float64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.Series([1., -999., 2., -999., -1000., 3.])\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 1.0\n",
"1 NaN\n",
"2 2.0\n",
"3 NaN\n",
"4 -1000.0\n",
"5 3.0\n",
"dtype: float64"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The -999 values might be sentinel values for missing data. To replace these with NA\n",
"# values that pandas understands, we can use replace\n",
"\n",
"data.replace(-999, np.nan)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 1.0\n",
"1 NaN\n",
"2 2.0\n",
"3 NaN\n",
"4 NaN\n",
"5 3.0\n",
"dtype: float64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# replace multiple values at once\n",
"\n",
"data.replace([-999,-1000], np.nan)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 1.0\n",
"1 NaN\n",
"2 2.0\n",
"3 NaN\n",
"4 0.0\n",
"5 3.0\n",
"dtype: float64"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To use a different replacement for each value, pass a list of substitutes:\n",
"data.replace([-999,-1000], [np.nan, 0])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 1.0\n",
"1 NaN\n",
"2 2.0\n",
"3 NaN\n",
"4 0.0\n",
"5 3.0\n",
"dtype: float64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The argument passed can also be a dict:\n",
"data.replace({-999: np.nan, -1000: 0})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Renaming Axis Indexes"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
"
\n",
" \n",
" \n",
" \n",
" Ohio | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" Colorado | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
" New York | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four\n",
"Ohio 0 1 2 3\n",
"Colorado 4 5 6 7\n",
"New York 8 9 10 11"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.DataFrame(np.arange(12).reshape((3, 4)),\n",
" index=['Ohio', 'Colorado', 'New York'],\n",
" columns=['one', 'two', 'three', 'four'])\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array(['OHIO', 'COLORADO', 'NEW YORK'], dtype=object)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.index.map(str.upper)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
"
\n",
" \n",
" \n",
" \n",
" OHIO | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" COLORADO | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
" NEW YORK | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four\n",
"OHIO 0 1 2 3\n",
"COLORADO 4 5 6 7\n",
"NEW YORK 8 9 10 11"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.index = data.index.map(str.upper)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ONE | \n",
" TWO | \n",
" THREE | \n",
" FOUR | \n",
"
\n",
" \n",
" \n",
" \n",
" Ohio | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" Colorado | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
" New York | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ONE TWO THREE FOUR\n",
"Ohio 0 1 2 3\n",
"Colorado 4 5 6 7\n",
"New York 8 9 10 11"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# o create a transformed version of a data set without modifying the original,\n",
"# a useful method is rename:\n",
"data.rename(index=str.title, columns=str.upper)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" peekaboo | \n",
" four | \n",
"
\n",
" \n",
" \n",
" \n",
" INDIANA | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" COLORADO | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
" NEW YORK | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two peekaboo four\n",
"INDIANA 0 1 2 3\n",
"COLORADO 4 5 6 7\n",
"NEW YORK 8 9 10 11"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# rename can be used in conjunction with a dict-like object providing new values\n",
"# for a subset of the axis labels\n",
"\n",
"data.rename(index={'OHIO': 'INDIANA'},\n",
" columns={'three': 'peekaboo'})"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
"
\n",
" \n",
" \n",
" \n",
" INDIANA | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" COLORADO | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
" NEW YORK | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four\n",
"INDIANA 0 1 2 3\n",
"COLORADO 4 5 6 7\n",
"NEW YORK 8 9 10 11"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# rename saves having to copy the DataFrame manually and assign to its index and columns \n",
"# attributes. Should you wish to modify a data set in place, pass inplace=True\n",
"\n",
"_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)\n",
"data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Discretization and Binning"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Continuous data is often discretized or otherwised separated into “bins” for analysis.\n",
"\n",
"ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]\n",
"ages"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[18, 25, 35, 60, 100]"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bins = [18, 25, 35, 60, 100]\n",
"bins"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]\n",
"Length: 12\n",
"Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cats = pd.cut(ages, bins)\n",
"cats"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\tools\\Anaconda3\\lib\\site-packages\\ipykernel\\__main__.py:1: FutureWarning: 'labels' is deprecated. Use 'codes' instead\n",
" if __name__ == '__main__':\n"
]
},
{
"data": {
"text/plain": [
"array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cats.labels"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cats.codes"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[18, 18, 18, 25, 18, 18, 35, 25, 60, 35, 35, 25]"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"[bins[i] for i in cats.codes]"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\tools\\Anaconda3\\lib\\site-packages\\ipykernel\\__main__.py:1: FutureWarning: Accessing 'levels' is deprecated, use 'categories'\n",
" if __name__ == '__main__':\n"
]
},
{
"data": {
"text/plain": [
"Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object')"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cats.levels"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['(18, 25]', '(25, 35]', '(35, 60]', '(60, 100]'], dtype='object')"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cats.categories"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(18, 25] 5\n",
"(35, 60] 3\n",
"(25, 35] 3\n",
"(60, 100] 1\n",
"dtype: int64"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.value_counts(cats)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]\n",
"Length: 12\n",
"Categories (4, object): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Consistent with mathematical notation for intervals, a parenthesis means that the side\n",
"# is open while the square bracket means it is closed (inclusive). Which side is closed can\n",
"# be changed by passing right=False:\n",
"pd.cut(ages, [18, 26, 36, 61, 100], right=False)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]\n",
"Length: 12\n",
"Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# You can also pass your own bin names by passing a list or array to the labels option:\n",
"group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']\n",
"pd.cut(ages, bins, labels=group_names)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.30040941, 0.42335852, 0.73095696, 0.74586086, 0.99197458,\n",
" 0.10334217, 0.53164815, 0.63212415, 0.29426088, 0.6753901 ,\n",
" 0.32015222, 0.26670971, 0.32528884, 0.51031056, 0.33031443,\n",
" 0.59880679, 0.69966579, 0.32159514, 0.71322471, 0.82226222])"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# If you pass cut a integer number of bins instead of explicit bin edges, it will compute\n",
"# equal-length bins based on the minimum and maximum values in the data. \n",
"data = np.random.rand(20)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0.1, 0.33], (0.33, 0.55], (0.55, 0.77], (0.55, 0.77], (0.77, 0.99], ..., (0.55, 0.77], (0.55, 0.77], (0.1, 0.33], (0.55, 0.77], (0.77, 0.99]]\n",
"Length: 20\n",
"Categories (4, object): [(0.1, 0.33] < (0.33, 0.55] < (0.55, 0.77] < (0.77, 0.99]]"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat=pd.cut(data, 4, precision=2)\n",
"cat"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['(0.1, 0.33]', '(0.33, 0.55]', '(0.55, 0.77]', '(0.77, 0.99]'], dtype='object')"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat.categories"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 1, 2, 2, 3, 0, 1, 2, 0, 2, 0, 0, 0, 1, 1, 2, 2, 0, 2, 3], dtype=int8)"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat.codes"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(0.55, 0.77] 7\n",
"(0.1, 0.33] 7\n",
"(0.33, 0.55] 4\n",
"(0.77, 0.99] 2\n",
"dtype: int64"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.value_counts(cat)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(0.23, 0.36], (0.36, 0.48], (0.61, 0.74], (0.74, 0.87], (0.87, 0.99], ..., (0.48, 0.61], (0.61, 0.74], (0.23, 0.36], (0.61, 0.74], (0.74, 0.87]]\n",
"Length: 20\n",
"Categories (7, object): [(0.1, 0.23] < (0.23, 0.36] < (0.36, 0.48] < (0.48, 0.61] < (0.61, 0.74] < (0.74, 0.87] < (0.87, 0.99]]"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat=pd.cut(data, 7, precision=2)\n",
"cat"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(0.23, 0.36] 7\n",
"(0.61, 0.74] 5\n",
"(0.48, 0.61] 3\n",
"(0.74, 0.87] 2\n",
"(0.87, 0.99] 1\n",
"(0.36, 0.48] 1\n",
"(0.1, 0.23] 1\n",
"dtype: int64"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.value_counts(cat)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(-0.605, 0.0514], (-0.605, 0.0514], [-3.296, -0.605], [-3.296, -0.605], [-3.296, -0.605], ..., (0.0514, 0.735], (0.735, 3.246], (0.0514, 0.735], (-0.605, 0.0514], (0.735, 3.246]]\n",
"Length: 1000\n",
"Categories (4, object): [[-3.296, -0.605] < (-0.605, 0.0514] < (0.0514, 0.735] < (0.735, 3.246]]"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A closely related function, qcut, bins the data based on sample quantiles.\n",
"data = np.random.randn(1000)\n",
"\n",
"cats = pd.qcut(data, 4) # Cut into quartiles\n",
"cats"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(0.735, 3.246] 250\n",
"(0.0514, 0.735] 250\n",
"(-0.605, 0.0514] 250\n",
"[-3.296, -0.605] 250\n",
"dtype: int64"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.value_counts(cats)"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(-1.251, 0.0514], (-1.251, 0.0514], [-3.296, -1.251], [-3.296, -1.251], [-3.296, -1.251], ..., (0.0514, 1.281], (1.281, 3.246], (0.0514, 1.281], (-1.251, 0.0514], (1.281, 3.246]]\n",
"Length: 1000\n",
"Categories (4, object): [[-3.296, -1.251] < (-1.251, 0.0514] < (0.0514, 1.281] < (1.281, 3.246]]"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Similar to cut you can pass your own quantiles (numbers between 0 and 1, inclusive):\n",
"cat=pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])\n",
"cat"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(0.0514, 1.281] 400\n",
"(-1.251, 0.0514] 400\n",
"(1.281, 3.246] 100\n",
"[-3.296, -1.251] 100\n",
"dtype: int64"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.value_counts(cat)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Detecting and Filtering Outliers"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
"
\n",
" \n",
" mean | \n",
" -0.067684 | \n",
" 0.067924 | \n",
" 0.025598 | \n",
" -0.002298 | \n",
"
\n",
" \n",
" std | \n",
" 0.998035 | \n",
" 0.992106 | \n",
" 1.006835 | \n",
" 0.996794 | \n",
"
\n",
" \n",
" min | \n",
" -3.428254 | \n",
" -3.548824 | \n",
" -3.184377 | \n",
" -3.745356 | \n",
"
\n",
" \n",
" 25% | \n",
" -0.774890 | \n",
" -0.591841 | \n",
" -0.641675 | \n",
" -0.644144 | \n",
"
\n",
" \n",
" 50% | \n",
" -0.116401 | \n",
" 0.101143 | \n",
" 0.002073 | \n",
" -0.013611 | \n",
"
\n",
" \n",
" 75% | \n",
" 0.616366 | \n",
" 0.780282 | \n",
" 0.680391 | \n",
" 0.654328 | \n",
"
\n",
" \n",
" max | \n",
" 3.366626 | \n",
" 2.653656 | \n",
" 3.260383 | \n",
" 3.927528 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3\n",
"count 1000.000000 1000.000000 1000.000000 1000.000000\n",
"mean -0.067684 0.067924 0.025598 -0.002298\n",
"std 0.998035 0.992106 1.006835 0.996794\n",
"min -3.428254 -3.548824 -3.184377 -3.745356\n",
"25% -0.774890 -0.591841 -0.641675 -0.644144\n",
"50% -0.116401 0.101143 0.002073 -0.013611\n",
"75% 0.616366 0.780282 0.680391 0.654328\n",
"max 3.366626 2.653656 3.260383 3.927528"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Filtering or transforming outliers is largely a matter of applying array operations.\n",
"\n",
"np.random.seed(12345)\n",
"\n",
"data = pd.DataFrame(np.random.randn(1000, 4))\n",
"data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"97 3.927528\n",
"305 -3.399312\n",
"400 -3.745356\n",
"Name: 3, dtype: float64"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Suppose you wanted to find values in one of the columns exceeding three in magnitude:\n",
"col = data[3]\n",
"\n",
"col[np.abs(col) > 3]"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 5 | \n",
" -0.539741 | \n",
" 0.476985 | \n",
" 3.248944 | \n",
" -1.021228 | \n",
"
\n",
" \n",
" 97 | \n",
" -0.774363 | \n",
" 0.552936 | \n",
" 0.106061 | \n",
" 3.927528 | \n",
"
\n",
" \n",
" 102 | \n",
" -0.655054 | \n",
" -0.565230 | \n",
" 3.176873 | \n",
" 0.959533 | \n",
"
\n",
" \n",
" 305 | \n",
" -2.315555 | \n",
" 0.457246 | \n",
" -0.025907 | \n",
" -3.399312 | \n",
"
\n",
" \n",
" 324 | \n",
" 0.050188 | \n",
" 1.951312 | \n",
" 3.260383 | \n",
" 0.963301 | \n",
"
\n",
" \n",
" 400 | \n",
" 0.146326 | \n",
" 0.508391 | \n",
" -0.196713 | \n",
" -3.745356 | \n",
"
\n",
" \n",
" 499 | \n",
" -0.293333 | \n",
" -0.242459 | \n",
" -3.056990 | \n",
" 1.918403 | \n",
"
\n",
" \n",
" 523 | \n",
" -3.428254 | \n",
" -0.296336 | \n",
" -0.439938 | \n",
" -0.867165 | \n",
"
\n",
" \n",
" 586 | \n",
" 0.275144 | \n",
" 1.179227 | \n",
" -3.184377 | \n",
" 1.369891 | \n",
"
\n",
" \n",
" 808 | \n",
" -0.362528 | \n",
" -3.548824 | \n",
" 1.553205 | \n",
" -2.186301 | \n",
"
\n",
" \n",
" 900 | \n",
" 3.366626 | \n",
" -2.372214 | \n",
" 0.851010 | \n",
" 1.332846 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3\n",
"5 -0.539741 0.476985 3.248944 -1.021228\n",
"97 -0.774363 0.552936 0.106061 3.927528\n",
"102 -0.655054 -0.565230 3.176873 0.959533\n",
"305 -2.315555 0.457246 -0.025907 -3.399312\n",
"324 0.050188 1.951312 3.260383 0.963301\n",
"400 0.146326 0.508391 -0.196713 -3.745356\n",
"499 -0.293333 -0.242459 -3.056990 1.918403\n",
"523 -3.428254 -0.296336 -0.439938 -0.867165\n",
"586 0.275144 1.179227 -3.184377 1.369891\n",
"808 -0.362528 -3.548824 1.553205 -2.186301\n",
"900 3.366626 -2.372214 0.851010 1.332846"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To select all rows having a value exceeding 3 or -3, you can use the any method on a\n",
"# boolean DataFrame:\n",
"\n",
" data[(np.abs(data) > 3).any(1)]"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
"
\n",
" \n",
" mean | \n",
" -0.067623 | \n",
" 0.068473 | \n",
" 0.025153 | \n",
" -0.002081 | \n",
"
\n",
" \n",
" std | \n",
" 0.995485 | \n",
" 0.990253 | \n",
" 1.003977 | \n",
" 0.989736 | \n",
"
\n",
" \n",
" min | \n",
" -3.000000 | \n",
" -3.000000 | \n",
" -3.000000 | \n",
" -3.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" -0.774890 | \n",
" -0.591841 | \n",
" -0.641675 | \n",
" -0.644144 | \n",
"
\n",
" \n",
" 50% | \n",
" -0.116401 | \n",
" 0.101143 | \n",
" 0.002073 | \n",
" -0.013611 | \n",
"
\n",
" \n",
" 75% | \n",
" 0.616366 | \n",
" 0.780282 | \n",
" 0.680391 | \n",
" 0.654328 | \n",
"
\n",
" \n",
" max | \n",
" 3.000000 | \n",
" 2.653656 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3\n",
"count 1000.000000 1000.000000 1000.000000 1000.000000\n",
"mean -0.067623 0.068473 0.025153 -0.002081\n",
"std 0.995485 0.990253 1.003977 0.989736\n",
"min -3.000000 -3.000000 -3.000000 -3.000000\n",
"25% -0.774890 -0.591841 -0.641675 -0.644144\n",
"50% -0.116401 0.101143 0.002073 -0.013611\n",
"75% 0.616366 0.780282 0.680391 0.654328\n",
"max 3.000000 2.653656 3.000000 3.000000"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# code to cap values outside the interval -3 to 3\n",
"data[np.abs(data) > 3] = np.sign(data) * 3\n",
"\n",
"data.describe()"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# data[np.abs(data) > 3]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Permutation and Random Sampling"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
" 2 | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
"
\n",
" \n",
" 3 | \n",
" 12 | \n",
" 13 | \n",
" 14 | \n",
" 15 | \n",
"
\n",
" \n",
" 4 | \n",
" 16 | \n",
" 17 | \n",
" 18 | \n",
" 19 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3\n",
"0 0 1 2 3\n",
"1 4 5 6 7\n",
"2 8 9 10 11\n",
"3 12 13 14 15\n",
"4 16 17 18 19"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Permuting (randomly reordering) a Series or the rows in a DataFrame is easy to do using\n",
"# the numpy.random.permutation function. Calling permutation with the length of the axis\n",
"# you want to permute produces an array of integers indicating the new ordering:\n",
"\n",
"df = pd.DataFrame(np.arange(5 * 4).reshape(5, 4))\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([1, 3, 4, 0, 2])"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sampler = np.random.permutation(5)\n",
"sampler"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
" 3 | \n",
" 12 | \n",
" 13 | \n",
" 14 | \n",
" 15 | \n",
"
\n",
" \n",
" 4 | \n",
" 16 | \n",
" 17 | \n",
" 18 | \n",
" 19 | \n",
"
\n",
" \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" 2 | \n",
" 8 | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3\n",
"1 4 5 6 7\n",
"3 12 13 14 15\n",
"4 16 17 18 19\n",
"0 0 1 2 3\n",
"2 8 9 10 11"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.take(sampler)"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
"
\n",
" \n",
" 3 | \n",
" 12 | \n",
" 13 | \n",
" 14 | \n",
" 15 | \n",
"
\n",
" \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3\n",
"1 4 5 6 7\n",
"3 12 13 14 15\n",
"0 0 1 2 3"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
    "# To select a random subset without replacement, one way is to slice off the first k\n",
    "# elements of the array returned by permutation, where k is the desired subset size. There\n",
"# are much more efficient sampling-without-replacement algorithms, but this is an easy\n",
"# strategy that uses readily available tools\n",
"\n",
"df.take(np.random.permutation(len(df))[:3])"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([2, 2, 0, 3, 0, 4, 1, 1, 2, 3])"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To generate a sample with replacement, the fastest way is to use np.random.randint to\n",
"# draw random integers:\n",
"bag = np.array([5, 7, -1, 6, 4])\n",
"\n",
"sampler = np.random.randint(0, len(bag), size=10)\n",
"sampler"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([-1, -1, 5, 6, 5, 4, 7, 7, -1, 6])"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"draws = bag.take(sampler)\n",
"draws"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Computing Indicator/Dummy Variables"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
    "Another type of transformation for statistical modeling or machine learning\n",
    "applications is converting a categorical variable into a “dummy” or “indicator” matrix. If a\n",
    "column in a DataFrame has k distinct values, you would derive a matrix or DataFrame\n",
    "containing k columns containing all 1’s and 0’s. pandas has a get_dummies function for\n",
    "doing this, though devising one yourself is not difficult. Let’s return to an earlier\n",
    "example DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" data1 | \n",
" key | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" b | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" b | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" a | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" c | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" a | \n",
"
\n",
" \n",
" 5 | \n",
" 5 | \n",
" b | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" data1 key\n",
"0 0 b\n",
"1 1 b\n",
"2 2 a\n",
"3 3 c\n",
"4 4 a\n",
"5 5 b"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],\n",
" 'data1': range(6)})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c\n",
"0 0.0 1.0 0.0\n",
"1 0.0 1.0 0.0\n",
"2 1.0 0.0 0.0\n",
"3 0.0 0.0 1.0\n",
"4 1.0 0.0 0.0\n",
"5 0.0 1.0 0.0"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.get_dummies(df['key'])"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" key_a | \n",
" key_b | \n",
" key_c | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" key_a key_b key_c\n",
"0 0.0 1.0 0.0\n",
"1 0.0 1.0 0.0\n",
"2 1.0 0.0 0.0\n",
"3 0.0 0.0 1.0\n",
"4 1.0 0.0 0.0\n",
"5 0.0 1.0 0.0"
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dummies = pd.get_dummies(df['key'], prefix='key')\n",
"dummies"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" data1 | \n",
" key_a | \n",
" key_b | \n",
" key_c | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 3 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 4 | \n",
" 1.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 5 | \n",
" 0.0 | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" data1 key_a key_b key_c\n",
"0 0 0.0 1.0 0.0\n",
"1 1 0.0 1.0 0.0\n",
"2 2 1.0 0.0 0.0\n",
"3 3 0.0 0.0 1.0\n",
"4 4 1.0 0.0 0.0\n",
"5 5 0.0 1.0 0.0"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_with_dummy = df[['data1']].join(dummies)\n",
"df_with_dummy"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" movieId | \n",
" title | \n",
" genres | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Toy Story (1995) | \n",
" Adventure|Animation|Children|Comedy|Fantasy | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Jumanji (1995) | \n",
" Adventure|Children|Fantasy | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Grumpier Old Men (1995) | \n",
" Comedy|Romance | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Waiting to Exhale (1995) | \n",
" Comedy|Drama|Romance | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Father of the Bride Part II (1995) | \n",
" Comedy | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" movieId title \\\n",
"0 1 Toy Story (1995) \n",
"1 2 Jumanji (1995) \n",
"2 3 Grumpier Old Men (1995) \n",
"3 4 Waiting to Exhale (1995) \n",
"4 5 Father of the Bride Part II (1995) \n",
"\n",
" genres \n",
"0 Adventure|Animation|Children|Comedy|Fantasy \n",
"1 Adventure|Children|Fantasy \n",
"2 Comedy|Romance \n",
"3 Comedy|Drama|Romance \n",
"4 Comedy "
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# reading the movie data\n",
"movies = pd.read_csv(\"dataset/movies.csv\")\n",
"movies.head()"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Adding indicator variables for each genre requires a little bit of wrangling. First, we\n",
"# extract the list of unique genres in the dataset (using a nice set.union trick)\n",
"\n",
"genre_iter = (set(x.split('|')) for x in movies.genres)\n",
"genres = sorted(set.union(*genre_iter))\n",
"\n",
"# for i in genre_iter:\n",
"# print(i)"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['(no genres listed)',\n",
" 'Action',\n",
" 'Adventure',\n",
" 'Animation',\n",
" 'Children',\n",
" 'Comedy',\n",
" 'Crime',\n",
" 'Documentary',\n",
" 'Drama',\n",
" 'Fantasy',\n",
" 'Film-Noir',\n",
" 'Horror',\n",
" 'IMAX',\n",
" 'Musical',\n",
" 'Mystery',\n",
" 'Romance',\n",
" 'Sci-Fi',\n",
" 'Thriller',\n",
" 'War',\n",
" 'Western']"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"genres"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"dummies = pd.DataFrame(np.zeros((len(movies), len(genres))), columns=genres)"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
    "# .ix was deprecated in pandas 0.20 and removed in 1.0; .loc is the\n",
    "# label-based equivalent for setting multiple columns in one row\n",
    "for i, gen in enumerate(movies.genres):\n",
    "    dummies.loc[i, gen.split('|')] = 1"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"movies_windic = movies.join(dummies.add_prefix('Genre_'))"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"movieId 1\n",
"title Toy Story (1995)\n",
"genres Adventure|Animation|Children|Comedy|Fantasy\n",
"Genre_(no genres listed) 0\n",
"Genre_Action 0\n",
"Genre_Adventure 1\n",
"Genre_Animation 1\n",
"Genre_Children 1\n",
"Genre_Comedy 1\n",
"Genre_Crime 0\n",
"Genre_Documentary 0\n",
"Genre_Drama 0\n",
"Genre_Fantasy 1\n",
"Genre_Film-Noir 0\n",
"Genre_Horror 0\n",
"Genre_IMAX 0\n",
"Genre_Musical 0\n",
"Genre_Mystery 0\n",
"Genre_Romance 0\n",
"Genre_Sci-Fi 0\n",
"Genre_Thriller 0\n",
"Genre_War 0\n",
"Genre_Western 0\n",
"Name: 0, dtype: object"
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
    "# .ix is removed in modern pandas; use .iloc for positional row access\n",
    "movies_windic.iloc[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda root]",
"language": "python",
"name": "conda-root-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}