{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datascience import *\n", "import numpy as np\n", "\n", "%matplotlib inline\n", "import matplotlib.pyplot as plots\n", "plots.style.use('fivethirtyeight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Lecture 9 ##" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Histogram Review ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#sleep = On average, how many hours of sleep do you get each night?\n", "#tv = During a typical week, how many hours do you spend watching television?\n", "#number = Pick a number between 0 and 9.\n", "#study = During a typical week, how many hours do you spend studying?\n", "#snow_white = Which of the Seven Dwarfs from Snow White are you most like?\n", "survey = Table.read_table('survey09.csv')\n", "survey.show(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "survey.hist('number', bins=np.arange(0, 9))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "study = survey.column(\"studying\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "[min(study), max(study)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "survey.hist('studying', bins=np.arange(0, 81, 10))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "survey.hist('studying', bins=np.arange(0, 51, 5))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "survey.hist('studying', bins=np.arange(0, 51, 2))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "survey.hist('tv', bins=15)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "survey.hist('sleep', bins=5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ "seven = survey.group('snow_white')\n", "seven.barh('snow_white')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "galton = Table.read_table('galton.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Each row corresponds to one adult child\n", "#family = family indicator\n", "#father height (inches) \n", "#mother height (inches) \n", "#\"midparent height\"= weighted average of parents' heights\n", "#children= # of children in the family\n", "#childNum = child's birth rank (1 = oldest)\n", "#gender\n", "#height (inches)\n", "galton" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights = galton.select(3, 7).relabeled(0, 'MidParent').relabeled(1, 'Child')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "my_bins = np.arange(55, 80, 2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Histogram of child heights\n", "heights.hist('Child', bins = my_bins, unit='Inch')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#Percentage of heights between 65 and 67\n", "heights.where('Child', are.between(65, 67)).num_rows / heights.num_rows" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Histogram of parent heights\n", "heights.hist('MidParent', bins=my_bins, unit='inch')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#Combined histogram\n", "heights.hist(bins=my_bins, unit='inch')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Functions ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def double(x):\n", " return x * 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "double(7)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "double(15/3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "my_number = 12" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "double(my_number)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "double(my_number / 8)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "double(make_array(3, 4, 5))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "double('data')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#\"local scope\"\n", "x" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x = 17" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "double(2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "double(x)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Discussion Question" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#What does this function do?\n", "def percents(values):\n", " return np.round(values / sum(values) * 100, 2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "percents(make_array(1, 2, 3, 4))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "percents(make_array(1, 4, 30))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Can have multiple inputs\n", "def percents(values, places):\n", " return np.round(values / sum(values) * 100, places)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "percents(make_array(1, 4, 30), 1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Apply ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "ages = Table().with_columns(\n", " 'Person', make_array('A', 'B', 'C', 'D'),\n", " 'Age', make_array(63, 110, 99, 102)\n", ")\n", "ages" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def cut_off_at_100(z):\n", " return min(z, 100)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cut_off_at_100(3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cut_off_at_100(107)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cut_age_array = ages.apply(cut_off_at_100, 'Age')\n", "cut_age_array" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ages.with_column('Cut off ages', cut_age_array)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "type(cut_off_at_100)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prediction ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights.scatter('MidParent', 'Child')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights.scatter('MidParent', 'Child')\n", "plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)\n", "plots.plot([68.5, 68.5], [50, 85], color='red', lw=2);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nearby = heights.where('MidParent', are.between(67.5, 68.5))\n", "nearby.column('Child').mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights.scatter('MidParent', 'Child')\n", "plots.plot([67.5, 67.5], [50, 85], color='red', lw=2)\n", "plots.plot([68.5, 68.5], [50, 85], color='red', lw=2)\n", "plots.scatter(68, 66.24, color='gold', s=75);" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def predict_child(h):\n", " nearby = heights.where('MidParent', are.between(h-0.5, h+0.5))\n", " return nearby.column('Child').mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predict_child(68)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predict_child(65)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predictions = heights.apply(predict_child, 'MidParent')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights = heights.with_column('Child Prediction', predictions)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "heights" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "heights.scatter('MidParent')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 1 }