{ "metadata": { "name": "", "signature": "sha256:a318835b4224090dfb7278ec7e82129b5966061077556fe99ce04391d435ef3f" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "import pandas as pd\n", "from pandas import Series, DataFrame" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "# Let's see how we would find outliers in a dataset\n", "\n", "# First we'll seed NumPy's random number generator so the results are reproducible\n", "np.random.seed(12345)\n", "\n", "# Next we'll create a DataFrame with 1000 rows and 4 columns of standard-normal random values\n", "dframe = DataFrame(np.random.randn(1000,4))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [ "# Show a preview of the first five rows\n", "dframe.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
0-0.204708 0.478943-0.519439-0.555730
1 1.965781 1.393406 0.092908 0.281746
2 0.769023 1.246435 1.007189-1.296221
3 0.274992 0.228913 1.352917 0.886429
4-2.001637-0.371843 1.669025-0.438570
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 25, "text": [ " 0 1 2 3\n", "0 -0.204708 0.478943 -0.519439 -0.555730\n", "1 1.965781 1.393406 0.092908 0.281746\n", "2 0.769023 1.246435 1.007189 -1.296221\n", "3 0.274992 0.228913 1.352917 0.886429\n", "4 -2.001637 -0.371843 1.669025 -0.438570" ] } ], "prompt_number": 25 }, { "cell_type": "code", "collapsed": false, "input": [ "# Let's describe the data\n", "dframe.describe()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.067684 0.067924 0.025598 -0.002298
std 0.998035 0.992106 1.006835 0.996794
min -3.428254 -3.548824 -3.184377 -3.745356
25% -0.774890 -0.591841 -0.641675 -0.644144
50% -0.116401 0.101143 0.002073 -0.013611
75% 0.616366 0.780282 0.680391 0.654328
max 3.366626 2.653656 3.260383 3.927528
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 26, "text": [ " 0 1 2 3\n", "count 1000.000000 1000.000000 1000.000000 1000.000000\n", "mean -0.067684 0.067924 0.025598 -0.002298\n", "std 0.998035 0.992106 1.006835 0.996794\n", "min -3.428254 -3.548824 -3.184377 -3.745356\n", "25% -0.774890 -0.591841 -0.641675 -0.644144\n", "50% -0.116401 0.101143 0.002073 -0.013611\n", "75% 0.616366 0.780282 0.680391 0.654328\n", "max 3.366626 2.653656 3.260383 3.927528" ] } ], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": [ "# Let's select the first column\n", "col = dframe[0]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 27 }, { "cell_type": "code", "collapsed": false, "input": [ "# Now we can check which values in the column have an absolute value greater than 3, for instance.\n", "col[np.abs(col)>3]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 28, "text": [ "523 -3.428254\n", "900 3.366626\n", "Name: 0, dtype: float64" ] } ], "prompt_number": 28 }, { "cell_type": "code", "collapsed": false, "input": [ "# So we now know in column[0], rows 523 and 900 have values with abs > 3\n", "\n", "# How about all the columns?\n", "\n", "# We can use the \"any\" method with axis=1 to keep every row that has at least one such value\n", "# (axis is passed by keyword: newer pandas versions reject a positional axis here)\n", "dframe[(np.abs(dframe)>3).any(axis=1)]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
5 -0.539741 0.476985 3.248944-1.021228
97 -0.774363 0.552936 0.106061 3.927528
102-0.655054-0.565230 3.176873 0.959533
305-2.315555 0.457246-0.025907-3.399312
324 0.050188 1.951312 3.260383 0.963301
400 0.146326 0.508391-0.196713-3.745356
499-0.293333-0.242459-3.056990 1.918403
523-3.428254-0.296336-0.439938-0.867165
586 0.275144 1.179227-3.184377 1.369891
808-0.362528-3.548824 1.553205-2.186301
900 3.366626-2.372214 0.851010 1.332846
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 29, "text": [ " 0 1 2 3\n", "5 -0.539741 0.476985 3.248944 -1.021228\n", "97 -0.774363 0.552936 0.106061 3.927528\n", "102 -0.655054 -0.565230 3.176873 0.959533\n", "305 -2.315555 0.457246 -0.025907 -3.399312\n", "324 0.050188 1.951312 3.260383 0.963301\n", "400 0.146326 0.508391 -0.196713 -3.745356\n", "499 -0.293333 -0.242459 -3.056990 1.918403\n", "523 -3.428254 -0.296336 -0.439938 -0.867165\n", "586 0.275144 1.179227 -3.184377 1.369891\n", "808 -0.362528 -3.548824 1.553205 -2.186301\n", "900 3.366626 -2.372214 0.851010 1.332846" ] } ], "prompt_number": 29 }, { "cell_type": "code", "collapsed": false, "input": [ "# We could also cap the data at +/-3: any value whose absolute value exceeds 3 is replaced by 3 with its original sign\n", "\n", "dframe[np.abs(dframe)>3] = np.sign(dframe) *3" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 33 }, { "cell_type": "code", "collapsed": false, "input": [ "dframe.describe()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
count 1000.000000 1000.000000 1000.000000 1000.000000
mean -0.061623 0.074473 0.037153 0.009919
std 0.995875 0.989820 1.003604 0.989688
min -2.969411 -2.989741 -2.925113 -2.881858
25% -0.774132 -0.588138 -0.622310 -0.636641
50% -0.115171 0.102787 0.012889 -0.010997
75% 0.619779 0.787953 0.682401 0.659019
max 3.000000 3.000000 3.000000 3.000000
\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 34, "text": [ " 0 1 2 3\n", "count 1000.000000 1000.000000 1000.000000 1000.000000\n", "mean -0.061623 0.074473 0.037153 0.009919\n", "std 0.995875 0.989820 1.003604 0.989688\n", "min -2.969411 -2.989741 -2.925113 -2.881858\n", "25% -0.774132 -0.588138 -0.622310 -0.636641\n", "50% -0.115171 0.102787 0.012889 -0.010997\n", "75% 0.619779 0.787953 0.682401 0.659019\n", "max 3.000000 3.000000 3.000000 3.000000" ] } ], "prompt_number": 34 }, { "cell_type": "code", "collapsed": false, "input": [ "# Next we'll learn about Permutation!" ], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }