{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import pandas as pd\n", "import numpy as np" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 31 }, { "cell_type": "code", "collapsed": false, "input": [ "# Sample Data about student test scores from two classes\n", "class_data = [{ 'student': 'AJ',\n", " 'class': 'A',\n", " 'score': 9\n", " },\n", " { 'student': 'Paul',\n", " 'class': 'A',\n", " 'score': 8\n", " },\n", " { 'student': 'Raymond',\n", " 'class': 'A',\n", " 'score': 7\n", " },\n", " { 'student': 'Jenny',\n", " 'class': 'B',\n", " 'score': 5\n", " },\n", " { 'student': 'Pete',\n", " 'class': 'B',\n", " 'score': 4\n", " },\n", " { 'student': 'Colin',\n", " 'class': 'B',\n", " 'score': 6\n", " },\n", " { 'student': 'Sarah',\n", " 'class': 'B',\n", " 'score': 4\n", " }]" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "# Build one row per student record. The column order is pinned explicitly\n", "# because it otherwise depends on the pandas version: older releases sort the\n", "# dict keys alphabetically, newer ones keep the dicts' insertion order.\n", "df = pd.DataFrame(class_data, columns=['class', 'score', 'student'])" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "# What does the dataframe look like?\n", "df" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
classscorestudent
0 A 9 AJ
1 A 8 Paul
2 A 7 Raymond
3 B 5 Jenny
4 B 4 Pete
5 B 6 Colin
6 B 4 Sarah
\n", "

7 rows \u00d7 3 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 67, "text": [ " class score student\n", "0 A 9 AJ\n", "1 A 8 Paul\n", "2 A 7 Raymond\n", "3 B 5 Jenny\n", "4 B 4 Pete\n", "5 B 6 Colin\n", "6 B 4 Sarah\n", "\n", "[7 rows x 3 columns]" ] } ], "prompt_number": 67 }, { "cell_type": "code", "collapsed": false, "input": [ "# Average score across every student, regardless of class\n", "df.score.mean()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 68, "text": [ "6.1428571428571432" ] } ], "prompt_number": 68 }, { "cell_type": "code", "collapsed": false, "input": [ "# Build a boolean mask: True for the rows whose class is A, False otherwise\n", "df['class'].isin(['A'])" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ "0 True\n", "1 True\n", "2 True\n", "3 False\n", "4 False\n", "5 False\n", "6 False\n", "Name: class, dtype: bool" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "# Index the frame with a class-A mask to keep only the class A rows\n", "df.loc[df['class'] == 'A']" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
classscorestudent
0 A 9 AJ
1 A 8 Paul
2 A 7 Raymond
\n", "

3 rows \u00d7 3 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 21, "text": [ " class score student\n", "0 A 9 AJ\n", "1 A 8 Paul\n", "2 A 7 Raymond\n", "\n", "[3 rows x 3 columns]" ] } ], "prompt_number": 21 }, { "cell_type": "code", "collapsed": false, "input": [ "# We can do a similar operation grouping by 'class'\n", "group = df.groupby(by='class')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "# This returns a DataFrameGroupBy object\n", "type(group)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 26, "text": [ "pandas.core.groupby.DataFrameGroupBy" ] } ], "prompt_number": 26 }, { "cell_type": "code", "collapsed": false, "input": [ "# We can get the mean from this object. Only the numeric 'score' column is\n", "# selected (as a one-column frame): the 'student' column holds strings, and\n", "# newer pandas releases raise a TypeError when asked to average it instead of\n", "# silently dropping it.\n", "class_mean = group[['score']].mean()\n", "\n", "# And this returns a dataframe\n", "type(class_mean)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 69, "text": [ "pandas.core.frame.DataFrame" ] } ], "prompt_number": 69 }, { "cell_type": "code", "collapsed": false, "input": [ "class_mean" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
score
class
A 8.00
B 4.75
\n", "

2 rows \u00d7 1 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 70, "text": [ " score\n", "class \n", "A 8.00\n", "B 4.75\n", "\n", "[2 rows x 1 columns]" ] } ], "prompt_number": 70 }, { "cell_type": "code", "collapsed": false, "input": [ "# Next, we can specify a column and use .aggregate to perform\n", "# multiple calculations on one column.\n", "# A list of function names yields one output column per name, in list order.\n", "# (Passing a dict of {name: function} to a single-column groupby is rejected\n", "# by newer pandas releases with a SpecificationError.)\n", "class_info = group['score'].aggregate(['std', 'sum', 'mean'])\n", "\n", "# this returns a dataframe\n", "type(class_info)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 71, "text": [ "pandas.core.frame.DataFrame" ] } ], "prompt_number": 71 }, { "cell_type": "code", "collapsed": false, "input": [ "class_info" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stdsummean
class
A 1.000000 24 8.00
B 0.957427 19 4.75
\n", "

2 rows \u00d7 3 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 72, "text": [ " std sum mean\n", "class \n", "A 1.000000 24 8.00\n", "B 0.957427 19 4.75\n", "\n", "[2 rows x 3 columns]" ] } ], "prompt_number": 72 } ], "metadata": {} } ] }