{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Cleaning and Preparation"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Missing data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 aardvark\n",
"1 artichoke\n",
"2 NaN\n",
"3 avocado\n",
"dtype: object"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"string_data = pd.Series([\"aardvark\", \"artichoke\", np.nan, \"avocado\"])\n",
"string_data"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 False\n",
"2 True\n",
"3 False\n",
"dtype: bool"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check for NAs i.e \"not available\" or NaN i.e \"not a number\"\n",
"string_data.isnull()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 None\n",
"1 artichoke\n",
"2 NaN\n",
"3 avocado\n",
"dtype: object"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"string_data[0] = None\n",
"string_data"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1 artichoke\n",
"3 avocado\n",
"dtype: object"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"string_data.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 7.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 6.0 | \n",
" NaN | \n",
" NaN | \n",
" 7.0 | \n",
"
\n",
" \n",
" 2 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 4.0 | \n",
" 5.0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 6.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5\n",
"0 1.0 2.0 3.0 4.0 5.0 7.0\n",
"1 4.0 5.0 6.0 NaN NaN 7.0\n",
"2 NaN NaN NaN NaN NaN NaN\n",
"3 4.0 5.0 NaN NaN NaN 6.0"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from numpy import nan as NA\n",
"df = pd.DataFrame([[1, 2, 3, 4, 5, 7], \n",
" [4, 5, 6, None, NA, 7],\n",
" [NA, NA, NA, NA, NA, NA],\n",
" [4, 5, NA, NA, None, 6]])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 7.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5\n",
"0 1.0 2.0 3.0 4.0 5.0 7.0"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 7.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 6.0 | \n",
" NaN | \n",
" NaN | \n",
" 7.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4.0 | \n",
" 5.0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 6.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5\n",
"0 1.0 2.0 3.0 4.0 5.0 7.0\n",
"1 4.0 5.0 6.0 NaN NaN 7.0\n",
"3 4.0 5.0 NaN NaN NaN 6.0"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dropna(how = \"all\")"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 7.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 6.0 | \n",
" NaN | \n",
" NaN | \n",
" 7.0 | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 4.0 | \n",
" 5.0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 6.0 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6\n",
"0 1.0 2.0 3.0 4.0 5.0 7.0 NaN\n",
"1 4.0 5.0 6.0 NaN NaN 7.0 NaN\n",
"2 NaN NaN NaN NaN NaN NaN NaN\n",
"3 4.0 5.0 NaN NaN NaN 6.0 NaN"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[6] = NA\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 7.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 6.0 | \n",
" NaN | \n",
" NaN | \n",
" 7.0 | \n",
"
\n",
" \n",
" 2 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 4.0 | \n",
" 5.0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 6.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5\n",
"0 1.0 2.0 3.0 4.0 5.0 7.0\n",
"1 4.0 5.0 6.0 NaN NaN 7.0\n",
"2 NaN NaN NaN NaN NaN NaN\n",
"3 4.0 5.0 NaN NaN NaN 6.0"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dropna(axis = 1, how = \"all\")"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 7.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 6.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 7.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 6.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6\n",
"0 1.0 2.0 3.0 4.0 5.0 7.0 0.0\n",
"1 4.0 5.0 6.0 0.0 0.0 7.0 0.0\n",
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n",
"3 4.0 5.0 0.0 0.0 0.0 6.0 0.0"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2 = df.fillna(0)\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.0 | \n",
" 2.0 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 7.000000 | \n",
" NaN | \n",
"
\n",
" \n",
" 1 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 6.0 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 7.000000 | \n",
" NaN | \n",
"
\n",
" \n",
" 2 | \n",
" 3.0 | \n",
" 4.0 | \n",
" 4.5 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 6.666667 | \n",
" NaN | \n",
"
\n",
" \n",
" 3 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 4.5 | \n",
" 4.0 | \n",
" 5.0 | \n",
" 6.000000 | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6\n",
"0 1.0 2.0 3.0 4.0 5.0 7.000000 NaN\n",
"1 4.0 5.0 6.0 4.0 5.0 7.000000 NaN\n",
"2 3.0 4.0 4.5 4.0 5.0 6.666667 NaN\n",
"3 4.0 5.0 4.5 4.0 5.0 6.000000 NaN"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3 = df.fillna(df.mean())\n",
"df3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data transformation"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" h1 | \n",
" h2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" one | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" two | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" three | \n",
" 2 | \n",
"
\n",
" \n",
" 3 | \n",
" one | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" two | \n",
" 5 | \n",
"
\n",
" \n",
" 5 | \n",
" three | \n",
" 5 | \n",
"
\n",
" \n",
" 6 | \n",
" one | \n",
" 2 | \n",
"
\n",
" \n",
" 7 | \n",
" two | \n",
" 2 | \n",
"
\n",
" \n",
" 8 | \n",
" three | \n",
" 5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" h1 h2\n",
"0 one 1\n",
"1 two 2\n",
"2 three 2\n",
"3 one 1\n",
"4 two 5\n",
"5 three 5\n",
"6 one 2\n",
"7 two 2\n",
"8 three 5"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame({\"h1\" : [\"one\", \"two\", \"three\"] * 3, \n",
" \"h2\" : [1, 2, 2, 1, 5, 5, 2, 2, 5]})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 True\n",
"4 False\n",
"5 False\n",
"6 False\n",
"7 True\n",
"8 True\n",
"dtype: bool"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.duplicated()"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" h1 | \n",
" h2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" one | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" two | \n",
" 2 | \n",
"
\n",
" \n",
" 2 | \n",
" three | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" two | \n",
" 5 | \n",
"
\n",
" \n",
" 5 | \n",
" three | \n",
" 5 | \n",
"
\n",
" \n",
" 6 | \n",
" one | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" h1 h2\n",
"0 one 1\n",
"1 two 2\n",
"2 three 2\n",
"4 two 5\n",
"5 three 5\n",
"6 one 2"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2 = df.drop_duplicates()\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 one\n",
"1 two\n",
"2 three\n",
"Name: h1, dtype: object"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2 = df[\"h1\"].drop_duplicates()\n",
"df2"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" food | \n",
" ounces | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" bacon | \n",
" 4.0 | \n",
"
\n",
" \n",
" 1 | \n",
" pulled pork | \n",
" 3.0 | \n",
"
\n",
" \n",
" 2 | \n",
" bacon | \n",
" 12.0 | \n",
"
\n",
" \n",
" 3 | \n",
" Pastrami | \n",
" 6.0 | \n",
"
\n",
" \n",
" 4 | \n",
" corned beef | \n",
" 7.5 | \n",
"
\n",
" \n",
" 5 | \n",
" Bacon | \n",
" 8.0 | \n",
"
\n",
" \n",
" 6 | \n",
" pastrami | \n",
" 3.0 | \n",
"
\n",
" \n",
" 7 | \n",
" honey ham | \n",
" 5.0 | \n",
"
\n",
" \n",
" 8 | \n",
" nova lox | \n",
" 6.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" food ounces\n",
"0 bacon 4.0\n",
"1 pulled pork 3.0\n",
"2 bacon 12.0\n",
"3 Pastrami 6.0\n",
"4 corned beef 7.5\n",
"5 Bacon 8.0\n",
"6 pastrami 3.0\n",
"7 honey ham 5.0\n",
"8 nova lox 6.0"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon','Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],\n",
" 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"meat_to_animal = { \n",
" 'bacon': 'pig', \n",
" 'pulled pork': 'pig', \n",
" 'pastrami': 'cow', \n",
" 'corned beef': 'cow', \n",
" 'honey ham': 'pig', \n",
" 'nova lox': 'salmon'}"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 bacon\n",
"1 pulled pork\n",
"2 bacon\n",
"3 pastrami\n",
"4 corned beef\n",
"5 bacon\n",
"6 pastrami\n",
"7 honey ham\n",
"8 nova lox\n",
"Name: food, dtype: object"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lowered = data.food.str.lower()\n",
"lowered"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" food | \n",
" ounces | \n",
" animal | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" bacon | \n",
" 4.0 | \n",
" pig | \n",
"
\n",
" \n",
" 1 | \n",
" pulled pork | \n",
" 3.0 | \n",
" pig | \n",
"
\n",
" \n",
" 2 | \n",
" bacon | \n",
" 12.0 | \n",
" pig | \n",
"
\n",
" \n",
" 3 | \n",
" Pastrami | \n",
" 6.0 | \n",
" cow | \n",
"
\n",
" \n",
" 4 | \n",
" corned beef | \n",
" 7.5 | \n",
" cow | \n",
"
\n",
" \n",
" 5 | \n",
" Bacon | \n",
" 8.0 | \n",
" pig | \n",
"
\n",
" \n",
" 6 | \n",
" pastrami | \n",
" 3.0 | \n",
" cow | \n",
"
\n",
" \n",
" 7 | \n",
" honey ham | \n",
" 5.0 | \n",
" pig | \n",
"
\n",
" \n",
" 8 | \n",
" nova lox | \n",
" 6.0 | \n",
" salmon | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" food ounces animal\n",
"0 bacon 4.0 pig\n",
"1 pulled pork 3.0 pig\n",
"2 bacon 12.0 pig\n",
"3 Pastrami 6.0 cow\n",
"4 corned beef 7.5 cow\n",
"5 Bacon 8.0 pig\n",
"6 pastrami 3.0 cow\n",
"7 honey ham 5.0 pig\n",
"8 nova lox 6.0 salmon"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[\"animal\"] = lowered.map(meat_to_animal)\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1.0\n",
"1 -999.0\n",
"2 2.0\n",
"3 -999.0\n",
"4 -1000.0\n",
"5 3.0\n",
"dtype: float64"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.Series([1., -999., 2., -999., -1000., 3.])\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1.0\n",
"1 NaN\n",
"2 2.0\n",
"3 NaN\n",
"4 -1000.0\n",
"5 3.0\n",
"dtype: float64"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.replace(-999, np.nan)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1.0\n",
"1 NaN\n",
"2 2.0\n",
"3 NaN\n",
"4 NaN\n",
"5 3.0\n",
"dtype: float64"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.replace([-999, -1000], np.nan)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1.0\n",
"1 NaN\n",
"2 2.0\n",
"3 NaN\n",
"4 0.0\n",
"5 3.0\n",
"dtype: float64"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.replace({-999 : np.nan, \n",
" -1000: 0})"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
"
\n",
" \n",
" \n",
" \n",
" ohio | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" colorado | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
" ny | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four\n",
"ohio 1 2 3 4\n",
"colorado 5 6 7 8\n",
"ny 9 10 11 12"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(np.arange(start = 1, stop = 13, step = 1).reshape(3, 4))\n",
"df.index = [\"ohio\", \"colorado\", \"ny\"]\n",
"df.columns = [\"one\", \"two\", \"three\", \"four\"]\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"to_upper = lambda x: x.upper()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['OHIO', 'COLORADO', 'NY'], dtype='object')"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.index.map(to_upper)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" one | \n",
" two | \n",
" three | \n",
" four | \n",
"
\n",
" \n",
" \n",
" \n",
" OHIO | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" COLORADO | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
" NY | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" one two three four\n",
"OHIO 1 2 3 4\n",
"COLORADO 5 6 7 8\n",
"NY 9 10 11 12"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.index = df.index.map(to_upper)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" One | \n",
" Two | \n",
" Three | \n",
" Four | \n",
"
\n",
" \n",
" \n",
" \n",
" OHIO | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
"
\n",
" \n",
" COLORADO | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
"
\n",
" \n",
" NY | \n",
" 9 | \n",
" 10 | \n",
" 11 | \n",
" 12 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" One Two Three Four\n",
"OHIO 1 2 3 4\n",
"COLORADO 5 6 7 8\n",
"NY 9 10 11 12"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rename(index = str.upper, columns = str.title)"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"age = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]\n",
"age"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"bins =[18, 25, 35, 60, 100]"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]\n",
"Length: 12\n",
"Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cats = pd.cut(age, bins)\n",
"cats"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]\n",
" closed='right',\n",
" dtype='interval[int64]')"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cats.categories"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cats.codes"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(18, 25] 5\n",
"(35, 60] 3\n",
"(25, 35] 3\n",
"(60, 100] 1\n",
"dtype: int64"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.value_counts(cats)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" -1.060885 | \n",
" 0.257677 | \n",
" -0.439057 | \n",
" -0.585580 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.992414 | \n",
" 0.848633 | \n",
" 0.279456 | \n",
" 1.804831 | \n",
"
\n",
" \n",
" 2 | \n",
" -0.465408 | \n",
" -1.573807 | \n",
" 0.313331 | \n",
" 0.939957 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.247694 | \n",
" -0.232562 | \n",
" -0.763437 | \n",
" 3.137660 | \n",
"
\n",
" \n",
" 4 | \n",
" -0.455083 | \n",
" 1.724922 | \n",
" -0.467447 | \n",
" 0.595734 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3\n",
"0 -1.060885 0.257677 -0.439057 -0.585580\n",
"1 0.992414 0.848633 0.279456 1.804831\n",
"2 -0.465408 -1.573807 0.313331 0.939957\n",
"3 0.247694 -0.232562 -0.763437 3.137660\n",
"4 -0.455083 1.724922 -0.467447 0.595734"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(np.random.randn(1000, 4))\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 0.024221 | \n",
" -0.005284 | \n",
" -0.007271 | \n",
" 0.058116 | \n",
"
\n",
" \n",
" std | \n",
" 0.999047 | \n",
" 0.981500 | \n",
" 1.027020 | \n",
" 0.993744 | \n",
"
\n",
" \n",
" min | \n",
" -3.060966 | \n",
" -3.082180 | \n",
" -3.907713 | \n",
" -3.536450 | \n",
"
\n",
" \n",
" 25% | \n",
" -0.651549 | \n",
" -0.667142 | \n",
" -0.709838 | \n",
" -0.641517 | \n",
"
\n",
" \n",
" 50% | \n",
" 0.028659 | \n",
" 0.023977 | \n",
" -0.028669 | \n",
" 0.048662 | \n",
"
\n",
" \n",
" 75% | \n",
" 0.683996 | \n",
" 0.725258 | \n",
" 0.675920 | \n",
" 0.739192 | \n",
"
\n",
" \n",
" max | \n",
" 3.759436 | \n",
" 2.994700 | \n",
" 3.263231 | \n",
" 3.197917 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3\n",
"count 1000.000000 1000.000000 1000.000000 1000.000000\n",
"mean 0.024221 -0.005284 -0.007271 0.058116\n",
"std 0.999047 0.981500 1.027020 0.993744\n",
"min -3.060966 -3.082180 -3.907713 -3.536450\n",
"25% -0.651549 -0.667142 -0.709838 -0.641517\n",
"50% 0.028659 0.023977 -0.028669 0.048662\n",
"75% 0.683996 0.725258 0.675920 0.739192\n",
"max 3.759436 2.994700 3.263231 3.197917"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"145 -3.907713\n",
"971 3.263231\n",
"Name: 2, dtype: float64"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[2][np.abs(df[2]) > 3]"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 555 | \n",
" -0.055178 | \n",
" -0.704377 | \n",
" 1.067797 | \n",
" -0.264149 | \n",
"
\n",
" \n",
" 295 | \n",
" -0.338811 | \n",
" -0.589736 | \n",
" -0.562751 | \n",
" -0.357064 | \n",
"
\n",
" \n",
" 329 | \n",
" 0.634184 | \n",
" 0.472945 | \n",
" -0.562836 | \n",
" -0.461018 | \n",
"
\n",
" \n",
" 727 | \n",
" -0.735358 | \n",
" -2.652772 | \n",
" -2.111481 | \n",
" 0.254709 | \n",
"
\n",
" \n",
" 484 | \n",
" 1.133932 | \n",
" 0.770841 | \n",
" 1.053564 | \n",
" -1.719157 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 1 2 3\n",
"555 -0.055178 -0.704377 1.067797 -0.264149\n",
"295 -0.338811 -0.589736 -0.562751 -0.357064\n",
"329 0.634184 0.472945 -0.562836 -0.461018\n",
"727 -0.735358 -2.652772 -2.111481 0.254709\n",
"484 1.133932 0.770841 1.053564 -1.719157"
]
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sample(frac = 0.75).head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## String manipulation"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'a, b, guide'"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val = \"a, b, guide\"\n",
"val"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['a', ' b', ' guide']"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val.split(\",\")"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['a', 'b', 'guide']"
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pieces = [x.strip() for x in val.split(\",\")]\n",
"pieces"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'a__b__guide'"
]
},
"execution_count": 96,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"__\".join(pieces)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"val.count(\"a\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Regular expressions\n",
"Flexible way to search or match string patterns"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"import re"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'foo bar\\t baz \\tqux'"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text = \"foo bar\\t baz \\tqux\"\n",
"text"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['foo', 'bar', 'baz', 'qux']"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"re.split(\"\\s+\", text)"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"regex = re.compile(\"\\s+\")"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['foo', 'bar', 'baz', 'qux']"
]
},
"execution_count": 105,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"regex.split(text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}