{ "metadata": { "name": "", "signature": "sha256:b72347f172084cf17e3050cd20aba0b2911b5fbb555736e3739b483e79a837a3" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Descriptive Statistics For pandas DataFrame\n", "\n", "- **Author:** [Chris Albon](http://www.chrisalbon.com/), [@ChrisAlbon](https://twitter.com/chrisalbon)\n", "- **Date:** -\n", "- **Repo:** [Python 3 code snippets for data science](https://github.com/chrisalbon/code_py)\n", "- **Note:**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import modules" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 40 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Create dataframe" ] }, { "cell_type": "code", "collapsed": false, "input": [ "data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], \n", " 'age': [42, 52, 36, 24, 73], \n", " 'preTestScore': [4, 24, 31, 2, 3],\n", " 'postTestScore': [25, 94, 57, 62, 70]}\n", "df = pd.DataFrame(data, columns = ['name', 'age', 'preTestScore', 'postTestScore'])\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nameagepreTestScorepostTestScore
0 Jason 42 4 25
1 Molly 52 24 94
2 Tina 36 31 57
3 Jake 24 2 62
4 Amy 73 3 70
\n", "

5 rows \u00d7 4 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 41, "text": [ " name age preTestScore postTestScore\n", "0 Jason 42 4 25\n", "1 Molly 52 24 94\n", "2 Tina 36 31 57\n", "3 Jake 24 2 62\n", "4 Amy 73 3 70\n", "\n", "[5 rows x 4 columns]" ] } ], "prompt_number": 41 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The sum of all the ages" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['age'].sum()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 42, "text": [ "227" ] } ], "prompt_number": 42 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Mean preTestScore" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['preTestScore'].mean()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 43, "text": [ "12.800000000000001" ] } ], "prompt_number": 43 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Cumulative sum of preTestScores, moving from the rows from the top" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['preTestScore'].cumsum()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 44, "text": [ "0 4\n", "1 28\n", "2 59\n", "3 61\n", "4 64\n", "Name: preTestScore, dtype: int64" ] } ], "prompt_number": 44 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Summary statistics on preTestScore" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['preTestScore'].describe()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 45, "text": [ "count 5.000000\n", "mean 12.800000\n", "std 13.663821\n", "min 2.000000\n", "25% 3.000000\n", "50% 4.000000\n", "75% 24.000000\n", "max 31.000000\n", "Name: preTestScore, dtype: float64" ] } ], "prompt_number": 45 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 
Count the number of non-NA values" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['preTestScore'].count()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 46, "text": [ "5" ] } ], "prompt_number": 46 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Minimum value of preTestScore" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['preTestScore'].min()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 47, "text": [ "2" ] } ], "prompt_number": 47 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Maximum value of preTestScore" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['preTestScore'].max()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 48, "text": [ "31" ] } ], "prompt_number": 48 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Median value of preTestScore" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['preTestScore'].median()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 49, "text": [ "4.0" ] } ], "prompt_number": 49 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sample variance of preTestScore values" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['preTestScore'].var()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 50, "text": [ "186.69999999999999" ] } ], "prompt_number": 50 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sample standard deviation of preTestScore values" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['preTestScore'].std()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 51, "text": [ 
"13.663820841916802" ] } ], "prompt_number": 51 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Skewness of preTestScore values" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['preTestScore'].skew()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 52, "text": [ "0.74334524573267591" ] } ], "prompt_number": 52 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Kurtosis of preTestScore values" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df['preTestScore'].kurt()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 53, "text": [ "-2.4673543738411525" ] } ], "prompt_number": 53 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Correlation Matrix Of Values" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df.corr()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agepreTestScorepostTestScore
age 1.000000-0.105651 0.328852
preTestScore-0.105651 1.000000 0.378039
postTestScore 0.328852 0.378039 1.000000
\n", "

3 rows \u00d7 3 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 54, "text": [ " age preTestScore postTestScore\n", "age 1.000000 -0.105651 0.328852\n", "preTestScore -0.105651 1.000000 0.378039\n", "postTestScore 0.328852 0.378039 1.000000\n", "\n", "[3 rows x 3 columns]" ] } ], "prompt_number": 54 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Covariance Matrix Of Values" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df.cov()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
agepreTestScorepostTestScore
age 340.80 -26.65 151.20
preTestScore -26.65 186.70 128.65
postTestScore 151.20 128.65 620.30
\n", "

3 rows \u00d7 3 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 55, "text": [ " age preTestScore postTestScore\n", "age 340.80 -26.65 151.20\n", "preTestScore -26.65 186.70 128.65\n", "postTestScore 151.20 128.65 620.30\n", "\n", "[3 rows x 3 columns]" ] } ], "prompt_number": 55 } ], "metadata": {} } ] }