{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datascience import *\n", "import numpy as np\n", "\n", "%matplotlib inline\n", "import matplotlib.pyplot as plots\n", "plots.style.use('fivethirtyeight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pivot Tables ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones = Table.read_table('cones.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones.group(['Flavor', 'Color'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones.pivot('Flavor', 'Color')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones.pivot('Color', 'Flavor')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones.pivot('Color', 'Flavor', values = 'Price', collect = list)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pivot Examples ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nba = Table.read_table('nba_salaries.csv').relabeled(\"'15-'16 SALARY\", 'SALARY')\n", "nba" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# for each team, average salary paid for each position\n", "\n", "nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = np.average)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# for each team,\n", "# amount paid to \"starter\" (player earning the most) in each position\n", "\n", "starters = nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = max)\n", "starters" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sort teams by total amount paid to starters\n", "\n", "totals = starters.drop('TEAM').apply(sum)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "starters.with_column('TOTAL', totals).sort('TOTAL', descending=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Join Examples" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "drinks = Table(['Drink', 'Cafe', 'Price']).with_rows([\n", " ['Milk Tea', 'Tea One', 4],\n", " ['Espresso', 'Nefeli', 2],\n", " ['Latte', 'Nefeli', 3],\n", " ['Espresso', \"Abe's\", 2]\n", "])\n", "drinks" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "discounts = Table().with_columns(\n", " 'Coupon % off', make_array(25, 50, 5),\n", " 'Location', make_array('Tea One', 'Nefeli', 'Tea One')\n", ")\n", "discounts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "combined = drinks.join('Cafe', discounts, 'Location')\n", "combined" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "discounted_prices = combined.column('Price') * (1 - combined.column('Coupon % off')/ 100)\n", "combined.with_column('Discounted', discounted_prices)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "drinks" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "two = drinks.join('Cafe', drinks)\n", "two" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "two.with_column('Total', two.column('Price') + two.column('Price_2'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Join Examples: Tennis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There was recently discussion about whether women tennis players get penalized more than men, for coaching violations. Let's look at some data." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Data on number of code violations at tennis Grand Slam tournaments, 1998-2018\n", "men_violations = Table.read_table('tennis_men_penalties.csv')\n", "women_violations = Table.read_table('tennis_women_penalties.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "men_violations" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "women_violations" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "combined = men_violations.join('Violation', women_violations)\n", "combined" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "combined.barh('Violation')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "men_sets = 460\n", "women_sets = 283\n", "normalized = combined.with_columns(\n", " 'Rate (Men)', combined.column('Count (Men)') / men_sets,\n", " 'Rate (Women)', combined.column('Count (Women)') / women_sets\n", ")\n", "normalized.drop('Count (Men)', 'Count (Women)').barh('Violation')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "While we're on the subject of tennis, You can view stats on the top men players, sorted by how often they win when they are serving -- see [here](https://www.atpworldtour.com/en/stats/service-games-won/2018/all/all/). The rankings of top 10 players are available [here](https://www.atpworldtour.com/en/rankings/singles). They can be converted these into a spreadsheet using [a handy web service](https://www.import.io/)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_men = Table.read_table('tennis_men_ranks.csv')\n", "men_serving = Table.read_table('tennis_men_serving.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_men" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "men_serving" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "men2 = top_men.join('Name', men_serving.select('Name', 'Percentage'))\n", "men2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "height = Table.read_table('tennis_men_heights.csv')\n", "height" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "men3 = men2.join('Name', height)\n", "men3" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "men3.scatter('Height', 'Percentage')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Bikes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "trips = Table.read_table('trip.csv')\n", "trips" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "commute = trips.where('Duration', are.below(1800))\n", "commute.hist('Duration')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "commute.hist('Duration', bins=60, unit='second')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "(550-250) * 0.15" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "commute.hist('Duration', bins=np.arange(1801), unit='second')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "starts = commute.group('Start Station').sort('count', descending=True)\n", "starts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "commute.pivot('Start Station', 'End Station')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "duration = trips.select('Start Station', 'End Station', 'Duration')\n", "duration" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "shortest = duration.group(['Start Station', 'End Station'], min)\n", "shortest" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from_cc = shortest.where('Start Station', are.containing('Civic Center BART')).sort('Duration min')\n", "from_cc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Maps" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "stations = Table.read_table('station.csv')\n", "stations" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "Marker.map_table(stations.select('lat', 'long', 'name'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sf = stations.where('landmark', 'San Francisco')\n", "Circle.map_table(sf.select('lat', 'long', 'name'), color='green', radius=150)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from_cc" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "colors = stations.group('landmark').with_column(\n", " 'color', make_array('blue', 'red', 'green', 'orange', 'purple'))\n", "colors" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "colored = stations.join('landmark', colors).select('lat', 'long', 'name', 'color')\n", "Marker.map_table(colored)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "station_starts = stations.join('name', starts, 'Start Station')\n", "station_starts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Circle.map_table(station_starts.select('lat', 'long', 'name').with_columns(\n", " 'color', 'blue',\n", " 'area', station_starts.column('count') * 1000\n", "))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" } }, "nbformat": 4, "nbformat_minor": 2 }