{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datascience import *\n",
    "import numpy as np\n",
    "\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plots\n",
    "plots.style.use('fivethirtyeight')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pivot Tables ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_cones = Table.read_table('cones.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_cones"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_cones.group(['Flavor', 'Color'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_cones.pivot('Flavor', 'Color')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_cones.pivot('Color', 'Flavor')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_cones.pivot('Color', 'Flavor', values = 'Price', collect = list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pivot Examples ##"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nba = Table.read_table('nba_salaries.csv').relabeled(\"'15-'16 SALARY\", 'SALARY')\n",
    "nba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for each team, average salary paid for each position\n",
    "\n",
    "nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = np.average)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for each team,\n",
    "# amount paid to \"starter\" (player earning the most) in each position\n",
    "\n",
    "starters = nba.pivot('POSITION', 'TEAM', values = 'SALARY', collect = max)\n",
    "starters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sort teams by total amount paid to starters\n",
    "\n",
    "totals = starters.drop('TEAM').apply(sum)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "starters.with_column('TOTAL', totals).sort('TOTAL', descending=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Join Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Make table 1\n",
    "drinks = Table(['Drink', 'Cafe', 'Price']).with_rows([\n",
    "    ['Milk Tea', 'Tea One', 4],\n",
    "    ['Espresso', 'Nefeli',  2],\n",
    "    ['Latte',    'Nefeli',  3],\n",
    "    ['Espresso', \"Abe's\",   2]\n",
    "])\n",
    "drinks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Make table 2\n",
    "discounts = Table().with_columns(\n",
    "    'Coupon % off', make_array(25, 50, 5),\n",
    "    'Location', make_array('Tea One', 'Nefeli', 'Tea One')\n",
    ")\n",
    "discounts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "combined = drinks.join('Cafe', discounts, 'Location')\n",
    "combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "discounted_prices = combined.column('Price') * (1 - combined.column('Coupon % off')/ 100)\n",
    "combined.with_column('Discounted', discounted_prices)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "drinks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Join table with itself\n",
    "two = drinks.join('Cafe', drinks)\n",
    "two"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "two.with_column('Total', two.column('Price') + two.column('Price_2'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Join Examples: Tennis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There was recently discussion about whether women tennis players get penalized more than men, for coaching violations.  Let's look at some data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data on number of code violations at tennis Grand Slam tournaments, 1998-2018\n",
    "men_violations = Table.read_table('tennis_men_penalties.csv')\n",
    "women_violations = Table.read_table('tennis_women_penalties.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "men_violations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "women_violations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "combined = men_violations.join('Violation', women_violations)\n",
    "combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "combined.barh('Violation')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "men_sets = 460\n",
    "women_sets = 283\n",
    "normalized = combined.with_columns(\n",
    "    'Rate (Men)', combined.column('Count (Men)') / men_sets,\n",
    "    'Rate (Women)', combined.column('Count (Women)') / women_sets\n",
    ")\n",
    "normalized.drop('Count (Men)', 'Count (Women)').barh('Violation')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Bikes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "###-----Bike rentals in Bay Area from 9/2014-8/2015\n",
    "#Trip ID = ID for the rental\n",
    "#Duration = duration of rental (seconds)\n",
    "#Start date = start date of rental\n",
    "#Start Station = Name of the Start Station \n",
    "#Start Terminal = code for Start Terminal\n",
    "#End Date = end date of rental\n",
    "#End Station = Name of the End Station \n",
    "#End Terminal = code for End Terminal\n",
    "#Bike # = A serial number for the bike\n",
    "#Subscriber Type = \"Customer\" or \"Subscriber\"\n",
    "#Zip code \n",
    "\n",
    "trips = Table.read_table('trip.csv')\n",
    "trips"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Trips < 1800 seconds (30 min) are free!\n",
    "commute = trips.where('Duration', are.below(1800))\n",
    "commute.hist('Duration')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "commute.hist('Duration', bins=60, unit='second')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Most trips between the 250 and 550 range\n",
    "#Rough estimate of percent between 250 and 550  minutes\n",
    "(550-250) * 0.15"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Smaller/more bins, but general shape of the distributions stays the same\n",
    "commute.hist('Duration', bins=np.arange(1801), unit='second')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#What is the most frequently used start station?\n",
    "starts = commute.group('Start Station').sort('count', descending=True)\n",
    "starts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "San Francisco Caltrain Station on Townsend and 4th - place where people who take the train into the city can grab a bike to get to work, etc."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Pivot can be used to investigate between which stations people are traveling\n",
    "commute.pivot('Start Station', 'End Station')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Shortest trips"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Look at durations between start and end stations\n",
    "duration = trips.select('Start Station', 'End Station', 'Duration')\n",
    "duration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Use the group method and find the minimum duration\n",
    "#One trip started and ended at 2nd at Folsom\n",
    "shortest = duration.group(['Start Station', 'End Station'], min)\n",
    "shortest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Civic Center Bart (7th at Market) is a popular location\n",
    "#We can investigate where people are headed from there, along with the shortest trip duration\n",
    "from_cc = shortest.where('Start Station', are.containing('Civic Center BART')).sort('Duration min')\n",
    "from_cc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Maps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Locations of each bike station\n",
    "stations = Table.read_table('station.csv').relabel(\"name\",\"labels\")\n",
    "stations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#Draw map where stations are located\n",
    "Marker.map_table(stations.select('lat', 'long', 'labels'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sf = stations.where('landmark', 'San Francisco')\n",
    "Circle.map_table(sf.select('lat', 'long', 'labels'), color='green', radius=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Color the markers according to the landmark\n",
    "colors = stations.group('landmark').with_column(\n",
    "    'colors', make_array('blue', 'red', 'green', 'orange', 'purple'))\n",
    "colors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "stations.join('landmark', colors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "colored = stations.join('landmark', colors).select('lat', 'long', 'labels', 'colors')\n",
    "Marker.map_table(colored)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Recall the counts per Start Station\n",
    "starts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "station_starts = stations.join('labels', starts, 'Start Station')\n",
    "station_starts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Circle.map_table(station_starts.select('lat', 'long', 'labels').with_columns(\n",
    "    'colors', 'blue',\n",
    "    'areas', station_starts.column('count')\n",
    "))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}