{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Data Cleaning and Preparation" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Missing data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 aardvark\n", "1 artichoke\n", "2 NaN\n", "3 avocado\n", "dtype: object" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string_data = pd.Series([\"aardvark\", \"artichoke\", np.nan, \"avocado\"])\n", "string_data" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 False\n", "2 True\n", "3 False\n", "dtype: bool" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check for NAs i.e \"not available\" or NaN i.e \"not a number\"\n", "string_data.isnull()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 None\n", "1 artichoke\n", "2 NaN\n", "3 avocado\n", "dtype: object" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string_data[0] = None\n", "string_data" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1 artichoke\n", "3 avocado\n", "dtype: object" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "string_data.dropna()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345
01.02.03.04.05.07.0
14.05.06.0NaNNaN7.0
2NaNNaNNaNNaNNaNNaN
34.05.0NaNNaNNaN6.0
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5\n", "0 1.0 2.0 3.0 4.0 5.0 7.0\n", "1 4.0 5.0 6.0 NaN NaN 7.0\n", "2 NaN NaN NaN NaN NaN NaN\n", "3 4.0 5.0 NaN NaN NaN 6.0" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from numpy import nan as NA\n", "df = pd.DataFrame([[1, 2, 3, 4, 5, 7], \n", " [4, 5, 6, None, NA, 7],\n", " [NA, NA, NA, NA, NA, NA],\n", " [4, 5, NA, NA, None, 6]])\n", "df" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345
01.02.03.04.05.07.0
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5\n", "0 1.0 2.0 3.0 4.0 5.0 7.0" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dropna()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345
01.02.03.04.05.07.0
14.05.06.0NaNNaN7.0
34.05.0NaNNaNNaN6.0
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5\n", "0 1.0 2.0 3.0 4.0 5.0 7.0\n", "1 4.0 5.0 6.0 NaN NaN 7.0\n", "3 4.0 5.0 NaN NaN NaN 6.0" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dropna(how = \"all\")" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456
01.02.03.04.05.07.0NaN
14.05.06.0NaNNaN7.0NaN
2NaNNaNNaNNaNNaNNaNNaN
34.05.0NaNNaNNaN6.0NaN
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6\n", "0 1.0 2.0 3.0 4.0 5.0 7.0 NaN\n", "1 4.0 5.0 6.0 NaN NaN 7.0 NaN\n", "2 NaN NaN NaN NaN NaN NaN NaN\n", "3 4.0 5.0 NaN NaN NaN 6.0 NaN" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[6] = NA\n", "df" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345
01.02.03.04.05.07.0
14.05.06.0NaNNaN7.0
2NaNNaNNaNNaNNaNNaN
34.05.0NaNNaNNaN6.0
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5\n", "0 1.0 2.0 3.0 4.0 5.0 7.0\n", "1 4.0 5.0 6.0 NaN NaN 7.0\n", "2 NaN NaN NaN NaN NaN NaN\n", "3 4.0 5.0 NaN NaN NaN 6.0" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.dropna(axis = 1, how = \"all\")" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456
01.02.03.04.05.07.00.0
14.05.06.00.00.07.00.0
20.00.00.00.00.00.00.0
34.05.00.00.00.06.00.0
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6\n", "0 1.0 2.0 3.0 4.0 5.0 7.0 0.0\n", "1 4.0 5.0 6.0 0.0 0.0 7.0 0.0\n", "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0\n", "3 4.0 5.0 0.0 0.0 0.0 6.0 0.0" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = df.fillna(0)\n", "df2" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456
01.02.03.04.05.07.000000NaN
14.05.06.04.05.07.000000NaN
23.04.04.54.05.06.666667NaN
34.05.04.54.05.06.000000NaN
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6\n", "0 1.0 2.0 3.0 4.0 5.0 7.000000 NaN\n", "1 4.0 5.0 6.0 4.0 5.0 7.000000 NaN\n", "2 3.0 4.0 4.5 4.0 5.0 6.666667 NaN\n", "3 4.0 5.0 4.5 4.0 5.0 6.000000 NaN" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3 = df.fillna(df.mean())\n", "df3" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data transformation" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
h1h2
0one1
1two2
2three2
3one1
4two5
5three5
6one2
7two2
8three5
\n", "
" ], "text/plain": [ " h1 h2\n", "0 one 1\n", "1 two 2\n", "2 three 2\n", "3 one 1\n", "4 two 5\n", "5 three 5\n", "6 one 2\n", "7 two 2\n", "8 three 5" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame({\"h1\" : [\"one\", \"two\", \"three\"] * 3, \n", " \"h2\" : [1, 2, 2, 1, 5, 5, 2, 2, 5]})\n", "df" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 False\n", "1 False\n", "2 False\n", "3 True\n", "4 False\n", "5 False\n", "6 False\n", "7 True\n", "8 True\n", "dtype: bool" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.duplicated()" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
h1h2
0one1
1two2
2three2
4two5
5three5
6one2
\n", "
" ], "text/plain": [ " h1 h2\n", "0 one 1\n", "1 two 2\n", "2 three 2\n", "4 two 5\n", "5 three 5\n", "6 one 2" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = df.drop_duplicates()\n", "df2" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 one\n", "1 two\n", "2 three\n", "Name: h1, dtype: object" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = df[\"h1\"].drop_duplicates()\n", "df2" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
foodounces
0bacon4.0
1pulled pork3.0
2bacon12.0
3Pastrami6.0
4corned beef7.5
5Bacon8.0
6pastrami3.0
7honey ham5.0
8nova lox6.0
\n", "
" ], "text/plain": [ " food ounces\n", "0 bacon 4.0\n", "1 pulled pork 3.0\n", "2 bacon 12.0\n", "3 Pastrami 6.0\n", "4 corned beef 7.5\n", "5 Bacon 8.0\n", "6 pastrami 3.0\n", "7 honey ham 5.0\n", "8 nova lox 6.0" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon','Pastrami', 'corned beef', 'Bacon', 'pastrami', 'honey ham', 'nova lox'],\n", " 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})\n", "data" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "meat_to_animal = { \n", " 'bacon': 'pig', \n", " 'pulled pork': 'pig', \n", " 'pastrami': 'cow', \n", " 'corned beef': 'cow', \n", " 'honey ham': 'pig', \n", " 'nova lox': 'salmon'}" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 bacon\n", "1 pulled pork\n", "2 bacon\n", "3 pastrami\n", "4 corned beef\n", "5 bacon\n", "6 pastrami\n", "7 honey ham\n", "8 nova lox\n", "Name: food, dtype: object" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "lowered = data.food.str.lower()\n", "lowered" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
foodouncesanimal
0bacon4.0pig
1pulled pork3.0pig
2bacon12.0pig
3Pastrami6.0cow
4corned beef7.5cow
5Bacon8.0pig
6pastrami3.0cow
7honey ham5.0pig
8nova lox6.0salmon
\n", "
" ], "text/plain": [ " food ounces animal\n", "0 bacon 4.0 pig\n", "1 pulled pork 3.0 pig\n", "2 bacon 12.0 pig\n", "3 Pastrami 6.0 cow\n", "4 corned beef 7.5 cow\n", "5 Bacon 8.0 pig\n", "6 pastrami 3.0 cow\n", "7 honey ham 5.0 pig\n", "8 nova lox 6.0 salmon" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[\"animal\"] = lowered.map(meat_to_animal)\n", "data" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "1 -999.0\n", "2 2.0\n", "3 -999.0\n", "4 -1000.0\n", "5 3.0\n", "dtype: float64" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.Series([1., -999., 2., -999., -1000., 3.])\n", "data" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "1 NaN\n", "2 2.0\n", "3 NaN\n", "4 -1000.0\n", "5 3.0\n", "dtype: float64" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.replace(-999, np.nan)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "1 NaN\n", "2 2.0\n", "3 NaN\n", "4 NaN\n", "5 3.0\n", "dtype: float64" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.replace([-999, -1000], np.nan)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1.0\n", "1 NaN\n", "2 2.0\n", "3 NaN\n", "4 0.0\n", "5 3.0\n", "dtype: float64" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.replace({-999 : np.nan, \n", " -1000: 0})" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
ohio1234
colorado5678
ny9101112
\n", "
" ], "text/plain": [ " one two three four\n", "ohio 1 2 3 4\n", "colorado 5 6 7 8\n", "ny 9 10 11 12" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(np.arange(start = 1, stop = 13, step = 1).reshape(3, 4))\n", "df.index = [\"ohio\", \"colorado\", \"ny\"]\n", "df.columns = [\"one\", \"two\", \"three\", \"four\"]\n", "df" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "to_upper = lambda x: x.upper()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['OHIO', 'COLORADO', 'NY'], dtype='object')" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.index.map(to_upper)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
onetwothreefour
OHIO1234
COLORADO5678
NY9101112
\n", "
" ], "text/plain": [ " one two three four\n", "OHIO 1 2 3 4\n", "COLORADO 5 6 7 8\n", "NY 9 10 11 12" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.index = df.index.map(to_upper)\n", "df" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OneTwoThreeFour
OHIO1234
COLORADO5678
NY9101112
\n", "
" ], "text/plain": [ " One Two Three Four\n", "OHIO 1 2 3 4\n", "COLORADO 5 6 7 8\n", "NY 9 10 11 12" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.rename(index = str.upper, columns = str.title)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "age = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]\n", "age" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "bins =[18, 25, 35, 60, 100]" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]\n", "Length: 12\n", "Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cats = pd.cut(age, bins)\n", "cats" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]\n", " closed='right',\n", " dtype='interval[int64]')" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cats.categories" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cats.codes" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(18, 25] 5\n", "(35, 60] 3\n", "(25, 35] 3\n", "(60, 100] 1\n", "dtype: int64" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.value_counts(cats)" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
0-1.0608850.257677-0.439057-0.585580
10.9924140.8486330.2794561.804831
2-0.465408-1.5738070.3133310.939957
30.247694-0.232562-0.7634373.137660
4-0.4550831.724922-0.4674470.595734
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "0 -1.060885 0.257677 -0.439057 -0.585580\n", "1 0.992414 0.848633 0.279456 1.804831\n", "2 -0.465408 -1.573807 0.313331 0.939957\n", "3 0.247694 -0.232562 -0.763437 3.137660\n", "4 -0.455083 1.724922 -0.467447 0.595734" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(np.random.randn(1000, 4))\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
count1000.0000001000.0000001000.0000001000.000000
mean0.024221-0.005284-0.0072710.058116
std0.9990470.9815001.0270200.993744
min-3.060966-3.082180-3.907713-3.536450
25%-0.651549-0.667142-0.709838-0.641517
50%0.0286590.023977-0.0286690.048662
75%0.6839960.7252580.6759200.739192
max3.7594362.9947003.2632313.197917
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "count 1000.000000 1000.000000 1000.000000 1000.000000\n", "mean 0.024221 -0.005284 -0.007271 0.058116\n", "std 0.999047 0.981500 1.027020 0.993744\n", "min -3.060966 -3.082180 -3.907713 -3.536450\n", "25% -0.651549 -0.667142 -0.709838 -0.641517\n", "50% 0.028659 0.023977 -0.028669 0.048662\n", "75% 0.683996 0.725258 0.675920 0.739192\n", "max 3.759436 2.994700 3.263231 3.197917" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "145 -3.907713\n", "971 3.263231\n", "Name: 2, dtype: float64" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[2][np.abs(df[2]) > 3]" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
555-0.055178-0.7043771.067797-0.264149
295-0.338811-0.589736-0.562751-0.357064
3290.6341840.472945-0.562836-0.461018
727-0.735358-2.652772-2.1114810.254709
4841.1339320.7708411.053564-1.719157
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "555 -0.055178 -0.704377 1.067797 -0.264149\n", "295 -0.338811 -0.589736 -0.562751 -0.357064\n", "329 0.634184 0.472945 -0.562836 -0.461018\n", "727 -0.735358 -2.652772 -2.111481 0.254709\n", "484 1.133932 0.770841 1.053564 -1.719157" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sample(frac = 0.75).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## String manipulation" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'a, b, guide'" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val = \"a, b, guide\"\n", "val" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['a', ' b', ' guide']" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val.split(\",\")" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['a', 'b', 'guide']" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pieces = [x.strip() for x in val.split(\",\")]\n", "pieces" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'a__b__guide'" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"__\".join(pieces)" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "val.count(\"a\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Regular expressions\n", "Flexible way to search or match string patterns" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "import re" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'foo bar\\t baz \\tqux'" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text = \"foo bar\\t baz \\tqux\"\n", "text" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['foo', 'bar', 'baz', 'qux']" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re.split(\"\\s+\", text)" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "regex = re.compile(\"\\s+\")" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['foo', 'bar', 'baz', 'qux']" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "regex.split(text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }