{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Adapted from [Computational Statistics in Python @ Duke](https://people.duke.edu/~ccc14/sta-663/UsingPandas.html)\n",
    "\n",
    "<a data-flickr-embed=\"true\" href=\"https://www.flickr.com/photos/kirbyurner/43857854005/in/album-72157660337424600/\" title=\"Anaconda3 Submenu on Win 10\"><img src=\"https://live.staticflickr.com/1897/43857854005_666b1bce45_n.jpg\" width=\"320\" height=\"161\" alt=\"Anaconda3 Submenu on Win 10\"></a><script async src=\"//embedr.flickr.com/assets/client-code.js\" charset=\"utf-8\"></script>\n",
    "\n",
    "Note: Because the pandas API is a moving target, on-line tutorials go out of date.  Adapting a tutorial to fit the new realities is good exercise in itself."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/mac/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n",
      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
     ]
    },
    {
     "ename": "ImportError",
     "evalue": "cannot import name 'chisqprob' from 'scipy.stats' (/Users/mac/opt/anaconda3/lib/python3.7/site-packages/scipy/stats/__init__.py)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-2-7d8cf4edbe10>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mSeries\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPanel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mstring\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mascii_lowercase\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mletters\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mscipy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mchisqprob\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mImportError\u001b[0m: cannot import name 'chisqprob' from 'scipy.stats' (/Users/mac/opt/anaconda3/lib/python3.7/site-packages/scipy/stats/__init__.py)"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "# import pandas.tseries as ts\n",
    "from pandas import Series, DataFrame, Panel\n",
    "from string import ascii_lowercase as letters\n",
    "from scipy.stats import chisqprob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "xs = Series(np.arange(10), index=tuple(letters[:10]))\n",
    "print(xs[:3],'\\n')\n",
    "print(xs[7:], '\\n')\n",
    "print(xs[::3], '\\n')\n",
    "print(xs[['d', 'f', 'h']], '\\n')\n",
    "print(xs.d, xs.f, xs.h)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All the numpy functions wiill work with Series objects, and return another Series\n",
    "\n",
    "y1, y2 = np.mean(xs), np.var(xs)\n",
    "y1, y2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Matplotlib will work on Series objects too\n",
    "plt.plot(xs, np.sin(xs), 'r-o', xs, np.cos(xs), 'b-x');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(xs.values)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[numpy.random.normal](https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.random.normal.html) takes mean and standard deviation as inputs, also size (how many)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The Series datatype can also be used to represent time series\n",
    "\n",
    "import datetime as dt\n",
    "from pandas import date_range\n",
    "\n",
    "# today = dt.date.today()\n",
    "today = dt.datetime.strptime('May 16 2018', '%b %d %Y') # plain Python\n",
    "print(today, '\\n')\n",
    "days = date_range(today, periods=35, freq='D')  # for indexing\n",
    "ts = Series(np.random.normal(10, 1, len(days)), index=days)\n",
    "\n",
    "# Extracting elements\n",
    "print (ts[0:4], '\\n')\n",
    "print (ts['2018-05-21':'2018-05-28'], '\\n') # Note - includes end time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df = ts.resample(rule='W', how=('mean', 'std', lambda x: sum(x*x))) -- OLD\n",
    "\n",
    "df = ts.resample(rule='W').apply([np.mean, np.std, lambda x: sum(x*x)])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Renaming columns\n",
    "# The use of mean and std are problmeatic because there are also methods in dataframe with those names\n",
    "# Also, <lambda> is unifnormative\n",
    "# So we would like to give better names to the columns of df\n",
    "\n",
    "df.columns = ('mu', 'sigma', 'sum_of_sq')\n",
    "print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print (df.mu, '\\n') # by attribute\n",
    "print (df['sigma'], '\\n') # by column name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extracting rows from a DataFrame\n",
    "\n",
    "print( df[1:3], '\\n')\n",
    "print( df['2018-05-16'::2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extracting blocks and scalars\n",
    "\n",
    "print (df.iat[2, 2], '\\n') # extract an element with iat()\n",
    "print (df.loc['2018-05-16':'2018-06-20', 'sum_of_sq'], '\\n') # indexing by label\n",
    "print (df.iloc[:3, 2], '\\n')  # indexing by position\n",
    "print (df.iloc[:3,:].loc[:,'sum_of_sq'], '\\n') # combining the two"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Using Boolean conditions for selecting eleements\n",
    "\n",
    "print (df[(df.sigma < 1) & (df.sum_of_sq < 700)], '\\n') # need parenthesis because of operator precedence\n",
    "print (df.query('sigma < 1 and sum_of_sq < 700')) # the query() method allows more readable query strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df= np.random.binomial(100, 0.95, (9,2))\n",
    "dm = np.random.binomial(100, 0.9, [12,2])\n",
    "dff = DataFrame(df, columns = ['Physics', 'Math'])\n",
    "dfm = DataFrame(dm, columns = ['Physics', 'Math'])\n",
    "score_panel = Panel({'Girls': dff, 'Boys': dfm})\n",
    "print(score_panel, '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "score_panel['Girls'].transpose()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# find physics and math scores of girls who scored >= 93 in math\n",
    "# a DataFrame is returned\n",
    "score_panel.loc['Girls', score_panel.Girls.Math >= 93, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import a DataFrame to play with\n",
    "try:\n",
    "    tips = pd.read_pickle('tips.pic')\n",
    "except:\n",
    "    tips = pd.read_csv('https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/reshape2/tips.csv', )\n",
    "    tips.to_pickle('tips.pic')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tips.head(n=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We have an extra set of indices in the first column\n",
    "# Let's get rid of it\n",
    "\n",
    "tips = tips.iloc[:, 1:]\n",
    "tips.head(n=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For an example of the split-apply-combine pattern, we want to see counts by sex and smoker status.\n",
    "# In other words, we split by sex and smoker status to get 2x2 groups,\n",
    "# then apply the size function to count the number of entries per group\n",
    "# and finally combine the results into a new multi-index Series\n",
    "\n",
    "grouped = tips.groupby(['sex', 'smoker'])\n",
    "grouped.size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If you need the margins, use the crosstab function\n",
    "\n",
    "pd.crosstab(tips.sex, tips.smoker, margins=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# If more than 1 column of resutls is generated, a DataFrame is returned\n",
    "\n",
    "grouped.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The returned results can be further manipulated via apply()\n",
    "# For example, suppose the bill and tips are in USD but we want EUR\n",
    "\n",
    "import json\n",
    "import urllib\n",
    "\n",
    "# get current conversion rate\n",
    "converter = json.loads(urllib.request.urlopen('http://free.currencyconverterapi.com/api/v3/convert?q=USD_EUR&compact=ultra ').read())\n",
    "print (converter)\n",
    "grouped['total_bill', 'tip'].mean().apply(lambda x: x*converter['USD_EUR'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We can also transform the original data for more convenient analysis\n",
    "# For example, suppose we want standardized units for total bill and tips\n",
    "\n",
    "zscore = lambda x: (x - x.mean())/x.std()\n",
    "\n",
    "std_grouped = grouped['total_bill', 'tip'].transform(zscore)\n",
    "std_grouped.head(n=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Suppose we want to apply a set of functions to only some columns\n",
    "grouped['total_bill', 'tip'].agg(['mean', 'min', 'max'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We can also apply specific functions to specific columns\n",
    "df = grouped.agg({'total_bill': (min, max), 'tip': sum})\n",
    "df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}