{ "metadata": { "name": "", "signature": "sha256:73eb77cd34995593495a287468f1aa0e702a7da42250f6b0344998059b8a0401" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Dealing With Outliers In Pandas\n", "\n", "- **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)\n", "- **Date:** -\n", "- **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)\n", "- **Note:**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### import modules" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import numpy as np" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 7 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create dataframe" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df = pd.DataFrame(np.random.randn(20, 5))\n", "df.columns = ['score_1', 'score_2', 'score_3', 'score_4', 'score_5']\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
score_1score_2score_3score_4score_5
0 0.809013-0.043927 0.928440-1.110457 1.623501
1 1.626228-0.304158 1.249759-1.851839 1.081616
2 1.894077 1.068677-0.111965-1.064986 0.035659
3 -0.386469-1.188364 0.831405-0.014511-0.260195
4 -1.242064 0.294049 0.714657-0.396795-1.219513
5 0.852891 0.355283 0.839451 0.746963-0.715827
6 0.191754-0.244156-0.238739 0.797857-1.355429
7 0.175954-0.465887 1.882503-0.174788 0.646117
8 -0.404646-0.755381 0.419163 0.918593 0.923306
9 -0.108578 0.166226 0.890846-0.016745-1.375534
10 0.101022-0.132286 0.274950-0.678942 0.053938
11 1.673355-0.164933 1.086568-1.621484-0.135308
12 1.128543 0.355407-1.380984 0.604208-1.095205
13-1.602945 0.614549-0.089838 0.652979 1.721376
14-1.272730-0.916772-0.594153 0.123623-0.655120
15 0.140682-0.364991-0.522412 0.863911 1.106638
16-0.265389-0.293563 1.066478-0.485762-2.222239
17 2.465491-0.437448-1.577115 0.243174-0.186260
18 0.927383-0.615659-0.075537 0.939576-0.662184
19-0.426472 0.990325 0.314062-0.678511 0.570545
\n", "

20 rows \u00d7 5 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 14, "text": [ " score_1 score_2 score_3 score_4 score_5\n", "0 0.809013 -0.043927 0.928440 -1.110457 1.623501\n", "1 1.626228 -0.304158 1.249759 -1.851839 1.081616\n", "2 1.894077 1.068677 -0.111965 -1.064986 0.035659\n", "3 -0.386469 -1.188364 0.831405 -0.014511 -0.260195\n", "4 -1.242064 0.294049 0.714657 -0.396795 -1.219513\n", "5 0.852891 0.355283 0.839451 0.746963 -0.715827\n", "6 0.191754 -0.244156 -0.238739 0.797857 -1.355429\n", "7 0.175954 -0.465887 1.882503 -0.174788 0.646117\n", "8 -0.404646 -0.755381 0.419163 0.918593 0.923306\n", "9 -0.108578 0.166226 0.890846 -0.016745 -1.375534\n", "10 0.101022 -0.132286 0.274950 -0.678942 0.053938\n", "11 1.673355 -0.164933 1.086568 -1.621484 -0.135308\n", "12 1.128543 0.355407 -1.380984 0.604208 -1.095205\n", "13 -1.602945 0.614549 -0.089838 0.652979 1.721376\n", "14 -1.272730 -0.916772 -0.594153 0.123623 -0.655120\n", "15 0.140682 -0.364991 -0.522412 0.863911 1.106638\n", "16 -0.265389 -0.293563 1.066478 -0.485762 -2.222239\n", "17 2.465491 -0.437448 -1.577115 0.243174 -0.186260\n", "18 0.927383 -0.615659 -0.075537 0.939576 -0.662184\n", "19 -0.426472 0.990325 0.314062 -0.678511 0.570545\n", "\n", "[20 rows x 5 columns]" ] } ], "prompt_number": 14 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Describe the dataframe" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df.describe()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
score_1score_2score_3score_4score_5
count 20.000000 20.000000 20.000000 20.000000 20.000000
mean 0.313855 -0.104151 0.295377 -0.110197 -0.106006
std 1.101752 0.592899 0.879432 0.860297 1.081506
min -1.602945 -1.188364 -1.577115 -1.851839 -2.222239
25% -0.391013 -0.444558 -0.143658 -0.678619 -0.810672
50% 0.158318 -0.204545 0.366613 -0.015628 -0.160784
75% 0.977673 0.309358 0.900244 0.676475 0.715414
max 2.465491 1.068677 1.882503 0.939576 1.721376
\n", "

8 rows \u00d7 5 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 15, "text": [ " score_1 score_2 score_3 score_4 score_5\n", "count 20.000000 20.000000 20.000000 20.000000 20.000000\n", "mean 0.313855 -0.104151 0.295377 -0.110197 -0.106006\n", "std 1.101752 0.592899 0.879432 0.860297 1.081506\n", "min -1.602945 -1.188364 -1.577115 -1.851839 -2.222239\n", "25% -0.391013 -0.444558 -0.143658 -0.678619 -0.810672\n", "50% 0.158318 -0.204545 0.366613 -0.015628 -0.160784\n", "75% 0.977673 0.309358 0.900244 0.676475 0.715414\n", "max 2.465491 1.068677 1.882503 0.939576 1.721376\n", "\n", "[8 rows x 5 columns]" ] } ], "prompt_number": 15 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Are there any values in score_1 above 2 or below -2?" ] }, { "cell_type": "code", "collapsed": false, "input": [ "col = df['score_1']\n", "col[np.abs(col) > 2]" ], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }