{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Adapted from [Computational Statistics in Python @ Duke](https://people.duke.edu/~ccc14/sta-663/UsingPandas.html)\n", "\n", "\"Anaconda3\n", "\n", "Note: Because the pandas API is a moving target, on-line tutorials go out of date. Adapting a tutorial to fit the new realities is good exercise in itself." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "import numpy as np\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/mac/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n", " This is separate from the ipykernel package so we can avoid doing imports until\n" ] }, { "ename": "ImportError", "evalue": "cannot import name 'chisqprob' from 'scipy.stats' (/Users/mac/opt/anaconda3/lib/python3.7/site-packages/scipy/stats/__init__.py)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mSeries\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPanel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mstring\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mascii_lowercase\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mletters\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mscipy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mchisqprob\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mImportError\u001b[0m: cannot import name 'chisqprob' from 'scipy.stats' (/Users/mac/opt/anaconda3/lib/python3.7/site-packages/scipy/stats/__init__.py)" ] } ], "source": [ "import pandas as pd\n", "# import pandas.tseries as ts\n", "from pandas import Series, DataFrame, Panel\n", "from string import ascii_lowercase as letters\n", "from scipy.stats import chisqprob" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "xs = Series(np.arange(10), index=tuple(letters[:10]))\n", "print(xs[:3],'\\n')\n", "print(xs[7:], '\\n')\n", "print(xs[::3], '\\n')\n", "print(xs[['d', 'f', 'h']], '\\n')\n", "print(xs.d, xs.f, xs.h)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# All the numpy functions wiill work with Series objects, and return another Series\n", "\n", "y1, y2 = np.mean(xs), np.var(xs)\n", "y1, y2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Matplotlib will work on Series objects too\n", "plt.plot(xs, np.sin(xs), 'r-o', xs, np.cos(xs), 'b-x');" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(xs.values)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[numpy.random.normal](https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.random.normal.html) takes mean and standard deviation as inputs, also size (how many)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# The Series datatype can also be used to represent time series\n", "\n", "import datetime as dt\n", "from pandas import date_range\n", "\n", "# today = dt.date.today()\n", "today = dt.datetime.strptime('May 16 2018', '%b %d %Y') # plain Python\n", "print(today, '\\n')\n", "days = date_range(today, periods=35, freq='D') # for indexing\n", "ts = Series(np.random.normal(10, 1, len(days)), index=days)\n", "\n", "# Extracting elements\n", "print (ts[0:4], '\\n')\n", "print (ts['2018-05-21':'2018-05-28'], '\\n') # Note - includes end time" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# df = ts.resample(rule='W', how=('mean', 'std', lambda x: sum(x*x))) -- OLD\n", "\n", "df = ts.resample(rule='W').apply([np.mean, np.std, lambda x: sum(x*x)])\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Renaming columns\n", "# The use of mean and std are problmeatic because there are also methods in dataframe with those names\n", "# Also, is unifnormative\n", "# So we would like to give better names to the columns of df\n", "\n", "df.columns = ('mu', 'sigma', 'sum_of_sq')\n", "print(df)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print (df.mu, '\\n') # by attribute\n", "print (df['sigma'], '\\n') # by column name" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Extracting rows from a DataFrame\n", "\n", "print( df[1:3], '\\n')\n", "print( df['2018-05-16'::2])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Extracting blocks and scalars\n", "\n", "print (df.iat[2, 2], '\\n') # extract an element with iat()\n", "print (df.loc['2018-05-16':'2018-06-20', 'sum_of_sq'], '\\n') # indexing by label\n", "print (df.iloc[:3, 2], '\\n') # indexing by position\n", "print (df.iloc[:3,:].loc[:,'sum_of_sq'], '\\n') # combining the two" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Using Boolean conditions for selecting eleements\n", "\n", "print (df[(df.sigma < 1) & (df.sum_of_sq < 700)], '\\n') # need parenthesis because of operator precedence\n", "print (df.query('sigma < 1 and sum_of_sq < 700')) # the query() method allows more readable query strings" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df= np.random.binomial(100, 0.95, (9,2))\n", "dm = np.random.binomial(100, 0.9, [12,2])\n", "dff = DataFrame(df, columns = ['Physics', 'Math'])\n", "dfm = DataFrame(dm, columns = ['Physics', 'Math'])\n", "score_panel = Panel({'Girls': dff, 'Boys': dfm})\n", "print(score_panel, '\\n')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "score_panel['Girls'].transpose()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# find physics and math scores of girls who scored >= 93 in math\n", "# a DataFrame is returned\n", "score_panel.loc['Girls', score_panel.Girls.Math >= 93, :]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# import a DataFrame to play with\n", "try:\n", " tips = pd.read_pickle('tips.pic')\n", "except:\n", " tips = pd.read_csv('https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/reshape2/tips.csv', )\n", " tips.to_pickle('tips.pic')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "tips.head(n=4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# We have an extra set of indices in the first column\n", "# Let's get rid of it\n", "\n", "tips = tips.iloc[:, 1:]\n", "tips.head(n=4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# For an example of the split-apply-combine pattern, we want to see counts by sex and smoker status.\n", "# In other words, we split by sex and smoker status to get 2x2 groups,\n", "# then apply the size function to count the number of entries per group\n", "# and finally combine the results into a new multi-index Series\n", "\n", "grouped = tips.groupby(['sex', 'smoker'])\n", "grouped.size()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# If you need the margins, use the crosstab function\n", "\n", "pd.crosstab(tips.sex, tips.smoker, margins=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# If more than 1 column of resutls is generated, a DataFrame is returned\n", "\n", "grouped.mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# The returned results can be further manipulated via apply()\n", "# For example, suppose the bill and tips are in USD but we want EUR\n", "\n", "import json\n", "import urllib\n", "\n", "# get current conversion rate\n", "converter = json.loads(urllib.request.urlopen('http://free.currencyconverterapi.com/api/v3/convert?q=USD_EUR&compact=ultra ').read())\n", "print (converter)\n", "grouped['total_bill', 'tip'].mean().apply(lambda x: x*converter['USD_EUR'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# We can also transform the original data for more convenient analysis\n", "# For example, suppose we want standardized units for total bill and tips\n", "\n", "zscore = lambda x: (x - x.mean())/x.std()\n", "\n", "std_grouped = grouped['total_bill', 'tip'].transform(zscore)\n", "std_grouped.head(n=4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Suppose we want to apply a set of functions to only some columns\n", "grouped['total_bill', 'tip'].agg(['mean', 'min', 'max'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# We can also apply specific functions to specific columns\n", "df = grouped.agg({'total_bill': (min, max), 'tip': sum})\n", "df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }