{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datascience import *\n", "import numpy as np\n", "\n", "%matplotlib inline\n", "import matplotlib.pyplot as plots\n", "plots.style.use('fivethirtyeight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Lecture 10 ##" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prediction ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "galton = Table.read_table('galton.csv')\n", "heights = galton.select(3, 7).relabeled(0, 'MidParent').relabeled(1, 'Child')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights.scatter('MidParent', 'Child')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights.scatter('MidParent', 'Child')\n", "plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)\n", "plots.plot([68.5, 68.5], [50, 85], color='red', lw=2);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nearby = heights.where('MidParent', are.between(67.5, 68.5))\n", "nearby.column('Child').mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights.scatter('MidParent', 'Child')\n", "plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)\n", "plots.plot([68.5, 68.5], [50, 85], color='red', lw=2)\n", "plots.scatter(68, 66.24, color='gold', s=50);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def predict_child(h):\n", " nearby = heights.where('MidParent', are.between(h - 0.5, h + 0.5))\n", " return nearby.column('Child').mean()\n", "\n", "predict_child(68)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predict_child(70)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predict_child(72)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predicted = heights.apply(predict_child, 'MidParent')\n", "predicted" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights = heights.with_column('Predicted child', predicted)\n", "heights" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights.scatter('MidParent')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Apply with Multiple Columns ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "temperatures = Table.read_table('temperatures.csv')\n", "temperatures" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "temperatures.plot('Day')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "temperatures.select('Low', 'High').hist(bins=np.arange(30, 105, 5))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "temperatures.scatter('Low', 'High')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Difference between high temp and low temp\n", "def difference(x, y):\n", " return x-y\n", "\n", "difference(65, 54)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "daily_spread = temperatures.apply(difference, 'High', 'Low')\n", "temperatures = temperatures.with_column('Spread', daily_spread)\n", "temperatures" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "temperatures.hist('Spread', bins=np.arange(0, 40, 4))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "temperatures.where('Spread', are.above(20)).num_rows / temperatures.num_rows" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "Function with Optional Arguments " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def percents(s, places):\n", " return np.round(s/sum(s) * 100, places)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x = make_array(2, 5, 16)\n", "percents(x, 4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def percents(s, places=2):\n", " return np.round(s/sum(s) * 100, places)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "percents(x)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Grouping by Category ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones = Table.read_table('cones.csv')\n", "all_cones" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cones = all_cones.drop('Color').exclude(5)\n", "cones" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cones.group('Flavor')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cones.group('Flavor', min)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cones.group('Flavor', list)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cones.group('Flavor', np.average)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Can get min price using .group\n", "cones.group('Flavor', min)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Or by computing using other table methods:\n", "min(cones.where('Flavor', 'chocolate').column('Price'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#User-defined function\n", "def spread(arr):\n", " return max(arr) - min(arr)\n", "\n", "spread(make_array(7, 10, 2))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Use your own function in .group\n", "cones.group('Flavor', spread)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Grouping by multiple columns" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cones" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones.group(['Flavor', 'Color'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones.group(['Flavor', 'Color'], np.average)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Examples" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nba = Table.read_table('nba_salaries.csv').relabeled(3, 'SALARY')\n", "nba" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# total salary paid by each team, highest first\n", "\n", "nba.select('TEAM', 'SALARY').group('TEAM', sum).sort('SALARY sum', descending=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nba.group('TEAM', sum)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# average salary paid for each position\n", "\n", "nba.select('POSITION', 'SALARY').group('POSITION', np.average)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# for each team, average salary paid for each position\n", "\n", "nba.drop('PLAYER').group(['TEAM', 'POSITION'], np.average)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pivot Tables ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "all_cones.group(['Flavor', 'Color'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "all_cones.pivot('Flavor', 'Color')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones.pivot('Flavor', 'Color', values='Price', collect=np.average)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones.pivot('Flavor', 'Color', values='Price', collect=list)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Examples" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "survey = Table.read_table('survey10.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "survey.show(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "survey.pivot('love_at_first_sight', 'year')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "survey.pivot('love_at_first_sight', 'super_power',values=\"number\",collect=np.mean)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Challenge Question" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Which NBA teams spent the most on their “starters” in 2015-2016?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nba" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Let's first look at average salary per team per position\n", "nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = np.average)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#If we assume the \"starter\" is the player paid the most in each position, \n", "#we can use \"collect = max\"\n", "nba_starters_salaries = nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = max)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Now we need to find the total paid for the starters\n", "totals = nba_starters_salaries.drop(\"TEAM\").apply(sum)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Add \"totals\" to our \"nba_starters_salaries\" table and sort by total\n", "nba_starters_salaries.with_column(\"TOTAL\", totals).sort(\"TOTAL\",descending = True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 1 }