{
"metadata": {
"name": "",
"signature": "sha256:a318835b4224090dfb7278ec7e82129b5966061077556fe99ce04391d435ef3f"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import numpy as np\n",
"import pandas as pd\n",
"from pandas import Series, DataFrame"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 23
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Let's see how we would find outliers in a dataset\n",
"\n",
"# First we'll seed the numpy generator\n",
"np.random.seed(12345)\n",
"\n",
"#Next we'll create the dataframe\n",
"dframe = DataFrame(np.random.randn(1000,4))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 24
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Show a preview of the first five rows\n",
"dframe.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" -0.204708 | \n",
" 0.478943 | \n",
" -0.519439 | \n",
" -0.555730 | \n",
"
\n",
" \n",
" 1 | \n",
" 1.965781 | \n",
" 1.393406 | \n",
" 0.092908 | \n",
" 0.281746 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.769023 | \n",
" 1.246435 | \n",
" 1.007189 | \n",
" -1.296221 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.274992 | \n",
" 0.228913 | \n",
" 1.352917 | \n",
" 0.886429 | \n",
"
\n",
" \n",
" 4 | \n",
" -2.001637 | \n",
" -0.371843 | \n",
" 1.669025 | \n",
" -0.438570 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 25,
"text": [
" 0 1 2 3\n",
"0 -0.204708 0.478943 -0.519439 -0.555730\n",
"1 1.965781 1.393406 0.092908 0.281746\n",
"2 0.769023 1.246435 1.007189 -1.296221\n",
"3 0.274992 0.228913 1.352917 0.886429\n",
"4 -2.001637 -0.371843 1.669025 -0.438570"
]
}
],
"prompt_number": 25
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Let's describe the data\n",
"dframe.describe()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
"
\n",
" \n",
" mean | \n",
" -0.067684 | \n",
" 0.067924 | \n",
" 0.025598 | \n",
" -0.002298 | \n",
"
\n",
" \n",
" std | \n",
" 0.998035 | \n",
" 0.992106 | \n",
" 1.006835 | \n",
" 0.996794 | \n",
"
\n",
" \n",
" min | \n",
" -3.428254 | \n",
" -3.548824 | \n",
" -3.184377 | \n",
" -3.745356 | \n",
"
\n",
" \n",
" 25% | \n",
" -0.774890 | \n",
" -0.591841 | \n",
" -0.641675 | \n",
" -0.644144 | \n",
"
\n",
" \n",
" 50% | \n",
" -0.116401 | \n",
" 0.101143 | \n",
" 0.002073 | \n",
" -0.013611 | \n",
"
\n",
" \n",
" 75% | \n",
" 0.616366 | \n",
" 0.780282 | \n",
" 0.680391 | \n",
" 0.654328 | \n",
"
\n",
" \n",
" max | \n",
" 3.366626 | \n",
" 2.653656 | \n",
" 3.260383 | \n",
" 3.927528 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 26,
"text": [
" 0 1 2 3\n",
"count 1000.000000 1000.000000 1000.000000 1000.000000\n",
"mean -0.067684 0.067924 0.025598 -0.002298\n",
"std 0.998035 0.992106 1.006835 0.996794\n",
"min -3.428254 -3.548824 -3.184377 -3.745356\n",
"25% -0.774890 -0.591841 -0.641675 -0.644144\n",
"50% -0.116401 0.101143 0.002073 -0.013611\n",
"75% 0.616366 0.780282 0.680391 0.654328\n",
"max 3.366626 2.653656 3.260383 3.927528"
]
}
],
"prompt_number": 26
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Let's select the first column\n",
"col = dframe[0]"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 27
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Now we can check which values in the column have an absolute value greater than 3, for instance.\n",
"col[np.abs(col)>3]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 28,
"text": [
"523 -3.428254\n",
"900 3.366626\n",
"Name: 0, dtype: float64"
]
}
],
"prompt_number": 28
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# So we now know in column[0], rows 523 and 900 have values with abs > 3\n",
"\n",
"#How about all the columns?\n",
"\n",
"# We can use the \"any\" method\n",
"dframe[(np.abs(dframe)>3).any(1)]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" 5 | \n",
" -0.539741 | \n",
" 0.476985 | \n",
" 3.248944 | \n",
" -1.021228 | \n",
"
\n",
" \n",
" 97 | \n",
" -0.774363 | \n",
" 0.552936 | \n",
" 0.106061 | \n",
" 3.927528 | \n",
"
\n",
" \n",
" 102 | \n",
" -0.655054 | \n",
" -0.565230 | \n",
" 3.176873 | \n",
" 0.959533 | \n",
"
\n",
" \n",
" 305 | \n",
" -2.315555 | \n",
" 0.457246 | \n",
" -0.025907 | \n",
" -3.399312 | \n",
"
\n",
" \n",
" 324 | \n",
" 0.050188 | \n",
" 1.951312 | \n",
" 3.260383 | \n",
" 0.963301 | \n",
"
\n",
" \n",
" 400 | \n",
" 0.146326 | \n",
" 0.508391 | \n",
" -0.196713 | \n",
" -3.745356 | \n",
"
\n",
" \n",
" 499 | \n",
" -0.293333 | \n",
" -0.242459 | \n",
" -3.056990 | \n",
" 1.918403 | \n",
"
\n",
" \n",
" 523 | \n",
" -3.428254 | \n",
" -0.296336 | \n",
" -0.439938 | \n",
" -0.867165 | \n",
"
\n",
" \n",
" 586 | \n",
" 0.275144 | \n",
" 1.179227 | \n",
" -3.184377 | \n",
" 1.369891 | \n",
"
\n",
" \n",
" 808 | \n",
" -0.362528 | \n",
" -3.548824 | \n",
" 1.553205 | \n",
" -2.186301 | \n",
"
\n",
" \n",
" 900 | \n",
" 3.366626 | \n",
" -2.372214 | \n",
" 0.851010 | \n",
" 1.332846 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 29,
"text": [
" 0 1 2 3\n",
"5 -0.539741 0.476985 3.248944 -1.021228\n",
"97 -0.774363 0.552936 0.106061 3.927528\n",
"102 -0.655054 -0.565230 3.176873 0.959533\n",
"305 -2.315555 0.457246 -0.025907 -3.399312\n",
"324 0.050188 1.951312 3.260383 0.963301\n",
"400 0.146326 0.508391 -0.196713 -3.745356\n",
"499 -0.293333 -0.242459 -3.056990 1.918403\n",
"523 -3.428254 -0.296336 -0.439938 -0.867165\n",
"586 0.275144 1.179227 -3.184377 1.369891\n",
"808 -0.362528 -3.548824 1.553205 -2.186301\n",
"900 3.366626 -2.372214 0.851010 1.332846"
]
}
],
"prompt_number": 29
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# WE could also possibly cap the data at 3\n",
"\n",
"dframe[np.abs(dframe)>3] = np.sign(dframe) *3"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 33
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"dframe.describe()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
" 1000.000000 | \n",
"
\n",
" \n",
" mean | \n",
" -0.061623 | \n",
" 0.074473 | \n",
" 0.037153 | \n",
" 0.009919 | \n",
"
\n",
" \n",
" std | \n",
" 0.995875 | \n",
" 0.989820 | \n",
" 1.003604 | \n",
" 0.989688 | \n",
"
\n",
" \n",
" min | \n",
" -2.969411 | \n",
" -2.989741 | \n",
" -2.925113 | \n",
" -2.881858 | \n",
"
\n",
" \n",
" 25% | \n",
" -0.774132 | \n",
" -0.588138 | \n",
" -0.622310 | \n",
" -0.636641 | \n",
"
\n",
" \n",
" 50% | \n",
" -0.115171 | \n",
" 0.102787 | \n",
" 0.012889 | \n",
" -0.010997 | \n",
"
\n",
" \n",
" 75% | \n",
" 0.619779 | \n",
" 0.787953 | \n",
" 0.682401 | \n",
" 0.659019 | \n",
"
\n",
" \n",
" max | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
" 3.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 34,
"text": [
" 0 1 2 3\n",
"count 1000.000000 1000.000000 1000.000000 1000.000000\n",
"mean -0.061623 0.074473 0.037153 0.009919\n",
"std 0.995875 0.989820 1.003604 0.989688\n",
"min -2.969411 -2.989741 -2.925113 -2.881858\n",
"25% -0.774132 -0.588138 -0.622310 -0.636641\n",
"50% -0.115171 0.102787 0.012889 -0.010997\n",
"75% 0.619779 0.787953 0.682401 0.659019\n",
"max 3.000000 3.000000 3.000000 3.000000"
]
}
],
"prompt_number": 34
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Next we'll learn about Permutation!"
],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}