{ "cells": [ { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# Advanced pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "np.random.seed(12345)\n", "import matplotlib.pyplot as plt\n", "plt.rc('figure', figsize=(10, 6))\n", "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n", "pd.options.display.max_rows = 20\n", "np.set_printoptions(precision=4, suppress=True)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Categorical Data" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Background and Motivation" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import numpy as np; import pandas as pd\n", "values = pd.Series(['apple', 'orange', 'apple',\n", " 'apple'] * 2)\n", "values\n", "pd.unique(values)\n", "pd.value_counts(values)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "values = pd.Series([0, 1, 0, 0] * 2)\n", "dim = pd.Series(['apple', 'orange'])\n", "values\n", "dim" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "dim.take(values)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Categorical Type in pandas" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "fruits = ['apple', 'orange', 'apple', 'apple'] * 2\n", "N = len(fruits)\n", "df = pd.DataFrame({'fruit': fruits,\n", " 'basket_id': np.arange(N),\n", " 'count': np.random.randint(3, 15, size=N),\n", " 'weight': np.random.uniform(0, 4, size=N)},\n", " columns=['basket_id', 'fruit', 'count', 'weight'])\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "fruit_cat = df['fruit'].astype('category')\n", "fruit_cat" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "c = fruit_cat.values\n", "type(c)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "c.categories\n", "c.codes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "df['fruit'] = df['fruit'].astype('category')\n", "df.fruit" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])\n", "my_categories" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "categories = ['foo', 'bar', 'baz']\n", "codes = [0, 1, 2, 0, 0, 1]\n", "my_cats_2 = pd.Categorical.from_codes(codes, categories)\n", "my_cats_2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "ordered_cat = pd.Categorical.from_codes(codes, categories,\n", " ordered=True)\n", "ordered_cat" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "my_cats_2.as_ordered()" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Computations with Categoricals" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "np.random.seed(12345)\n", "draws = np.random.randn(1000)\n", "draws[:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "bins = pd.qcut(draws, 4)\n", "bins" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])\n", "bins\n", "bins.codes[:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "bins = pd.Series(bins, name='quartile')\n", "results = (pd.Series(draws)\n", " .groupby(bins)\n", " .agg(['count', 'min', 'max'])\n", " .reset_index())\n", "results" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "results['quartile']" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "#### Better performance with categoricals" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "N = 10000000\n", "draws = pd.Series(np.random.randn(N))\n", "labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "categories = labels.astype('category')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "labels.memory_usage()\n", "categories.memory_usage()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "%time _ = labels.astype('category')" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Categorical Methods" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "s = pd.Series(['a', 'b', 'c', 'd'] * 2)\n", "cat_s = s.astype('category')\n", "cat_s" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "cat_s.cat.codes\n", "cat_s.cat.categories" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "actual_categories = ['a', 'b', 'c', 'd', 'e']\n", "cat_s2 = cat_s.cat.set_categories(actual_categories)\n", "cat_s2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "cat_s.value_counts()\n", "cat_s2.value_counts()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "cat_s3 = cat_s[cat_s.isin(['a', 'b'])]\n", "cat_s3\n", "cat_s3.cat.remove_unused_categories()" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "#### Creating dummy variables for modeling" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "pd.get_dummies(cat_s)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Advanced GroupBy Use" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Group Transforms and \"Unwrapped\" GroupBys" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "df = pd.DataFrame({'key': ['a', 'b', 'c'] * 4,\n", " 'value': np.arange(12.)})\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "g = df.groupby('key').value\n", "g.mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "g.transform(lambda x: x.mean())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "g.transform('mean')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "g.transform(lambda x: x * 2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "g.transform(lambda x: x.rank(ascending=False))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "def normalize(x):\n", " return (x - x.mean()) / x.std()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "g.transform(normalize)\n", "g.apply(normalize)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "g.transform('mean')\n", "normalized = (df['value'] - g.transform('mean')) / g.transform('std')\n", "normalized" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### Grouped Time Resampling" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "N = 15\n", "times = pd.date_range('2017-05-20 00:00', freq='1min', periods=N)\n", "df = pd.DataFrame({'time': times,\n", " 'value': np.arange(N)})\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "df.set_index('time').resample('5min').count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "df2 = pd.DataFrame({'time': times.repeat(3),\n", " 'key': np.tile(['a', 'b', 'c'], N),\n", " 'value': np.arange(N * 3.)})\n", "df2[:7]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "time_key = pd.TimeGrouper('5min')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "resampled = (df2.set_index('time')\n", " .groupby(['key', time_key])\n", " .sum())\n", "resampled\n", "resampled.reset_index()" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Techniques for Method Chaining" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "```python\n", "df = load_data()\n", "df2 = df[df['col2'] < 0]\n", "df2['col1_demeaned'] = df2['col1'] - df2['col1'].mean()\n", "result = df2.groupby('key').col1_demeaned.std()\n", "```" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "```python\n", "# Usual non-functional way\n", "df2 = df.copy()\n", "df2['k'] = v\n", "\n", "# Functional assign way\n", "df2 = df.assign(k=v)\n", "```" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "```python\n", "result = (df2.assign(col1_demeaned=df2.col1 - df2.col2.mean())\n", " .groupby('key')\n", " .col1_demeaned.std())\n", "```" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "```python\n", "df = load_data()\n", "df2 = df[df['col2'] < 0]\n", "```" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "```python\n", "df = (load_data()\n", " [lambda x: x['col2'] < 0])\n", "```" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "```python\n", "result = (load_data()\n", " [lambda x: x.col2 < 0]\n", " .assign(col1_demeaned=lambda x: x.col1 - x.col1.mean())\n", " .groupby('key')\n", " .col1_demeaned.std())\n", "```" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "### The pipe Method" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "```python\n", "a = f(df, arg1=v1)\n", "b = g(a, v2, arg3=v3)\n", "c = h(b, arg4=v4)\n", "```" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "```python\n", "result = (df.pipe(f, arg1=v1)\n", " .pipe(g, v2, arg3=v3)\n", " .pipe(h, arg4=v4))\n", "```" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "```python\n", "g = df.groupby(['key1', 'key2'])\n", "df['col1'] = df['col1'] - g.transform('mean')\n", "```" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "```python\n", "def group_demean(df, by, cols):\n", " result = df.copy()\n", " g = df.groupby(by)\n", " for c in cols:\n", " result[c] = df[c] - g[c].transform('mean')\n", " return result\n", "```" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "```python\n", "result = (df[df.col1 < 0]\n", " .pipe(group_demean, ['key1', 'key2'], ['col1']))\n", "```" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "## Conclusion" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.1" } }, "nbformat": 4, "nbformat_minor": 0 }