{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Adapted from [Computational Statistics in Python @ Duke](https://people.duke.edu/~ccc14/sta-663/UsingPandas.html)\n",
"\n",
"\n",
"\n",
"Note: Because the pandas API is a moving target, on-line tutorials go out of date. Adapting a tutorial to fit the new realities is good exercise in itself."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/mac/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:3: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n",
" This is separate from the ipykernel package so we can avoid doing imports until\n"
]
},
{
"ename": "ImportError",
"evalue": "cannot import name 'chisqprob' from 'scipy.stats' (/Users/mac/opt/anaconda3/lib/python3.7/site-packages/scipy/stats/__init__.py)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mSeries\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mPanel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mstring\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mascii_lowercase\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mletters\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mscipy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstats\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mchisqprob\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mImportError\u001b[0m: cannot import name 'chisqprob' from 'scipy.stats' (/Users/mac/opt/anaconda3/lib/python3.7/site-packages/scipy/stats/__init__.py)"
]
}
],
"source": [
"import pandas as pd\n",
"# import pandas.tseries as ts\n",
"from pandas import Series, DataFrame, Panel\n",
"from string import ascii_lowercase as letters\n",
"from scipy.stats import chisqprob"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"xs = Series(np.arange(10), index=tuple(letters[:10]))\n",
"print(xs[:3],'\\n')\n",
"print(xs[7:], '\\n')\n",
"print(xs[::3], '\\n')\n",
"print(xs[['d', 'f', 'h']], '\\n')\n",
"print(xs.d, xs.f, xs.h)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# All the numpy functions wiill work with Series objects, and return another Series\n",
"\n",
"y1, y2 = np.mean(xs), np.var(xs)\n",
"y1, y2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Matplotlib will work on Series objects too\n",
"plt.plot(xs, np.sin(xs), 'r-o', xs, np.cos(xs), 'b-x');"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(xs.values)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[numpy.random.normal](https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.random.normal.html) takes mean and standard deviation as inputs, also size (how many)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The Series datatype can also be used to represent time series\n",
"\n",
"import datetime as dt\n",
"from pandas import date_range\n",
"\n",
"# today = dt.date.today()\n",
"today = dt.datetime.strptime('May 16 2018', '%b %d %Y') # plain Python\n",
"print(today, '\\n')\n",
"days = date_range(today, periods=35, freq='D') # for indexing\n",
"ts = Series(np.random.normal(10, 1, len(days)), index=days)\n",
"\n",
"# Extracting elements\n",
"print (ts[0:4], '\\n')\n",
"print (ts['2018-05-21':'2018-05-28'], '\\n') # Note - includes end time"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# df = ts.resample(rule='W', how=('mean', 'std', lambda x: sum(x*x))) -- OLD\n",
"\n",
"df = ts.resample(rule='W').apply([np.mean, np.std, lambda x: sum(x*x)])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Renaming columns\n",
"# The use of mean and std are problmeatic because there are also methods in dataframe with those names\n",
"# Also, is unifnormative\n",
"# So we would like to give better names to the columns of df\n",
"\n",
"df.columns = ('mu', 'sigma', 'sum_of_sq')\n",
"print(df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print (df.mu, '\\n') # by attribute\n",
"print (df['sigma'], '\\n') # by column name"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extracting rows from a DataFrame\n",
"\n",
"print( df[1:3], '\\n')\n",
"print( df['2018-05-16'::2])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Extracting blocks and scalars\n",
"\n",
"print (df.iat[2, 2], '\\n') # extract an element with iat()\n",
"print (df.loc['2018-05-16':'2018-06-20', 'sum_of_sq'], '\\n') # indexing by label\n",
"print (df.iloc[:3, 2], '\\n') # indexing by position\n",
"print (df.iloc[:3,:].loc[:,'sum_of_sq'], '\\n') # combining the two"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Using Boolean conditions for selecting eleements\n",
"\n",
"print (df[(df.sigma < 1) & (df.sum_of_sq < 700)], '\\n') # need parenthesis because of operator precedence\n",
"print (df.query('sigma < 1 and sum_of_sq < 700')) # the query() method allows more readable query strings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df= np.random.binomial(100, 0.95, (9,2))\n",
"dm = np.random.binomial(100, 0.9, [12,2])\n",
"dff = DataFrame(df, columns = ['Physics', 'Math'])\n",
"dfm = DataFrame(dm, columns = ['Physics', 'Math'])\n",
"score_panel = Panel({'Girls': dff, 'Boys': dfm})\n",
"print(score_panel, '\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"score_panel['Girls'].transpose()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# find physics and math scores of girls who scored >= 93 in math\n",
"# a DataFrame is returned\n",
"score_panel.loc['Girls', score_panel.Girls.Math >= 93, :]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# import a DataFrame to play with\n",
"try:\n",
" tips = pd.read_pickle('tips.pic')\n",
"except:\n",
" tips = pd.read_csv('https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/reshape2/tips.csv', )\n",
" tips.to_pickle('tips.pic')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tips.head(n=4)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We have an extra set of indices in the first column\n",
"# Let's get rid of it\n",
"\n",
"tips = tips.iloc[:, 1:]\n",
"tips.head(n=4)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For an example of the split-apply-combine pattern, we want to see counts by sex and smoker status.\n",
"# In other words, we split by sex and smoker status to get 2x2 groups,\n",
"# then apply the size function to count the number of entries per group\n",
"# and finally combine the results into a new multi-index Series\n",
"\n",
"grouped = tips.groupby(['sex', 'smoker'])\n",
"grouped.size()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If you need the margins, use the crosstab function\n",
"\n",
"pd.crosstab(tips.sex, tips.smoker, margins=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If more than 1 column of resutls is generated, a DataFrame is returned\n",
"\n",
"grouped.mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The returned results can be further manipulated via apply()\n",
"# For example, suppose the bill and tips are in USD but we want EUR\n",
"\n",
"import json\n",
"import urllib\n",
"\n",
"# get current conversion rate\n",
"converter = json.loads(urllib.request.urlopen('http://free.currencyconverterapi.com/api/v3/convert?q=USD_EUR&compact=ultra ').read())\n",
"print (converter)\n",
"grouped['total_bill', 'tip'].mean().apply(lambda x: x*converter['USD_EUR'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We can also transform the original data for more convenient analysis\n",
"# For example, suppose we want standardized units for total bill and tips\n",
"\n",
"zscore = lambda x: (x - x.mean())/x.std()\n",
"\n",
"std_grouped = grouped['total_bill', 'tip'].transform(zscore)\n",
"std_grouped.head(n=4)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Suppose we want to apply a set of functions to only some columns\n",
"grouped['total_bill', 'tip'].agg(['mean', 'min', 'max'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# We can also apply specific functions to specific columns\n",
"df = grouped.agg({'total_bill': (min, max), 'tip': sum})\n",
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}