{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datascience import *\n", "import numpy as np\n", "\n", "%matplotlib inline\n", "import matplotlib.pyplot as plots\n", "plots.style.use('fivethirtyeight')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pivot Tables ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones = Table.read_table('cones.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones.group(['Flavor', 'Color'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones.pivot('Flavor', 'Color')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones.pivot('Color', 'Flavor')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "all_cones.pivot('Color', 'Flavor', values = 'Price', collect = list)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Pivot Examples ##" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "nba = Table.read_table('nba_salaries.csv').relabeled(\"'15-'16 SALARY\", 'SALARY')\n", "nba" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# for each team, average salary paid for each position\n", "\n", "nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = np.average)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# for each team,\n", "# amount paid to \"starter\" (player earning the most) in each position\n", "\n", "starters = nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = max)\n", "starters" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sort teams by total amount paid to starters\n", "\n", "totals = starters.drop('TEAM').apply(sum)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "starters.with_column('TOTAL', totals).sort('TOTAL', descending=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Join Examples" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Make table 1\n", "drinks = Table(['Drink', 'Cafe', 'Price']).with_rows([\n", " ['Milk Tea', 'Tea One', 4],\n", " ['Espresso', 'Nefeli', 2],\n", " ['Latte', 'Nefeli', 3],\n", " ['Espresso', \"Abe's\", 2]\n", "])\n", "drinks" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Make table 2\n", "discounts = Table().with_columns(\n", " 'Coupon % off', make_array(25, 50, 5),\n", " 'Location', make_array('Tea One', 'Nefeli', 'Tea One')\n", ")\n", "discounts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "combined = drinks.join('Cafe', discounts, 'Location')\n", "combined" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "discounted_prices = combined.column('Price') * (1 - combined.column('Coupon % off')/ 100)\n", "combined.with_column('Discounted', discounted_prices)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "drinks" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Join table with itself\n", "two = drinks.join('Cafe', drinks)\n", "two" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "two.with_column('Total', two.column('Price') + two.column('Price_2'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Join Examples: Tennis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There was recently discussion about whether women tennis players get penalized more than men, for coaching violations. Let's look at some data." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Data on number of code violations at tennis Grand Slam tournaments, 1998-2018\n", "men_violations = Table.read_table('tennis_men_penalties.csv')\n", "women_violations = Table.read_table('tennis_women_penalties.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "men_violations" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "women_violations" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "combined = men_violations.join('Violation', women_violations)\n", "combined" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "combined.barh('Violation')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "men_sets = 460\n", "women_sets = 283\n", "normalized = combined.with_columns(\n", " 'Rate (Men)', combined.column('Count (Men)') / men_sets,\n", " 'Rate (Women)', combined.column('Count (Women)') / women_sets\n", ")\n", "normalized.drop('Count (Men)', 'Count (Women)').barh('Violation')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Bikes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "###-----Bike rentals in Bay Area from 9/2014-8/2015\n", "#Trip ID = ID for the rental\n", "#Duration = duration of rental (seconds)\n", "#Start date = start date of rental\n", "#Start Station = Name of the Start Station \n", "#Start Terminal = code for Start Terminal\n", "#End Date = end date of rental\n", "#End Station = Name of the End Station \n", "#End Terminal = code for End Terminal\n", "#Bike # = A serial number for the bike\n", "#Subscriber Type = \"Customer\" or \"Subscriber\"\n", "#Zip code \n", "\n", "trips = Table.read_table('trip.csv')\n", "trips" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Trips < 1800 seconds (30 min) are free!\n", "commute = trips.where('Duration', are.below(1800))\n", "commute.hist('Duration')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "commute.hist('Duration', bins=60, unit='second')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Most trips between the 250 and 550 range\n", "#Rough estimate of percent between 250 and 550 minutes\n", "(550-250) * 0.15" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Smaller/more bins, but general shape of the distributions stays the same\n", "commute.hist('Duration', bins=np.arange(1801), unit='second')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#What is the most frequently used start station?\n", "starts = commute.group('Start Station').sort('count', descending=True)\n", "starts" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "San Francisco Caltrain Station on Townsend and 4th - place where people who take the train into the city can grab a bike to get to work, etc." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Pivot can be used to investigate between which stations people are traveling\n", "commute.pivot('Start Station', 'End Station')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Shortest trips" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Look at durations between start and end stations\n", "duration = trips.select('Start Station', 'End Station', 'Duration')\n", "duration" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Use the group method and find the minimum duration\n", "#One trip started and ended at 2nd at Folsom\n", "shortest = duration.group(['Start Station', 'End Station'], min)\n", "shortest" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Civic Center Bart (7th at Market) is a popular location\n", "#We can investigate where people are headed from there, along with the shortest trip duration\n", "from_cc = shortest.where('Start Station', are.containing('Civic Center BART')).sort('Duration min')\n", "from_cc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Maps" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Locations of each bike station\n", "stations = Table.read_table('station.csv').relabel(\"name\",\"labels\")\n", "stations" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#Draw map where stations are located\n", "Marker.map_table(stations.select('lat', 'long', 'labels'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sf = stations.where('landmark', 'San Francisco')\n", "Circle.map_table(sf.select('lat', 'long', 'labels'), color='green', radius=50)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Color the markers according to the landmark\n", "colors = stations.group('landmark').with_column(\n", " 'colors', make_array('blue', 'red', 'green', 'orange', 'purple'))\n", "colors" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "stations.join('landmark', colors)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "colored = stations.join('landmark', colors).select('lat', 'long', 'labels', 'colors')\n", "Marker.map_table(colored)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Recall the counts per Start Station\n", "starts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "station_starts = stations.join('labels', starts, 'Start Station')\n", "station_starts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Circle.map_table(station_starts.select('lat', 'long', 'labels').with_columns(\n", " 'colors', 'blue',\n", " 'areas', station_starts.column('count')\n", "))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 2 }