{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Plotting and Visualization" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from __future__ import division\n", "from numpy.random import randn\n", "import numpy as np\n", "import os\n", "import matplotlib.pyplot as plt\n", "np.random.seed(12345)\n", "plt.rc('figure', figsize=(10, 6))\n", "from pandas import Series, DataFrame\n", "import pandas as pd\n", "np.set_printoptions(precision=4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%pwd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## A brief matplotlib API primer" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import matplotlib.pyplot as plt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Figures and Subplots" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig = plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "ax1 = fig.add_subplot(2, 2, 1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "ax2 = fig.add_subplot(2, 2, 2)\n", "ax3 = fig.add_subplot(2, 2, 3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from numpy.random import randn\n", "plt.plot(randn(50).cumsum(), 'k--')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "_ = ax1.hist(randn(100), bins=20, color='k', alpha=0.3)\n", "ax2.scatter(np.arange(30), np.arange(30) + 3 * randn(30))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.close('all')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig, axes = plt.subplots(2, 3)\n", "axes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Adjusting the spacing around subplots" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.subplots_adjust(left=None, bottom=None, right=None, top=None,\n", " wspace=None, hspace=None)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)\n", "for i in range(2):\n", " for j in range(2):\n", " axes[i, j].hist(randn(500), bins=50, color='k', alpha=0.5)\n", "plt.subplots_adjust(wspace=0, hspace=0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)\n", "for i in range(2):\n", " for j in range(2):\n", " axes[i, j].hist(randn(500), bins=50, color='k', alpha=0.5)\n", "plt.subplots_adjust(wspace=0, hspace=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Colors, markers, and line styles" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.plot(randn(30).cumsum(), 'ko--')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.close('all')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data = randn(30).cumsum()\n", "plt.plot(data, 'k--', label='Default')\n", "plt.plot(data, 'k-', drawstyle='steps-post', label='steps-post')\n", "plt.legend(loc='best')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Ticks, labels, and legends" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Setting the title, axis labels, ticks, and ticklabels" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig = plt.figure(); ax = fig.add_subplot(1, 1, 1)\n", "ax.plot(randn(1000).cumsum())\n", "\n", "ticks = ax.set_xticks([0, 250, 500, 750, 1000])\n", "labels = ax.set_xticklabels(['one', 'two', 'three', 'four', 'five'],\n", " rotation=30, fontsize='small')\n", "ax.set_title('My first matplotlib plot')\n", "ax.set_xlabel('Stages')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Adding legends" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig = plt.figure(); ax = fig.add_subplot(1, 1, 1)\n", "ax.plot(randn(1000).cumsum(), 'k', label='one')\n", "ax.plot(randn(1000).cumsum(), 'k--', label='two')\n", "ax.plot(randn(1000).cumsum(), 'k.', label='three')\n", "\n", "ax.legend(loc='best')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Annotations and drawing on a subplot" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from datetime import datetime\n", "\n", "fig = plt.figure()\n", "ax = fig.add_subplot(1, 1, 1)\n", "\n", "data = pd.read_csv('ch08/spx.csv', index_col=0, parse_dates=True)\n", "spx = data['SPX']\n", "\n", "spx.plot(ax=ax, style='k-')\n", "\n", "crisis_data = [\n", " (datetime(2007, 10, 11), 'Peak of bull market'),\n", " (datetime(2008, 3, 12), 'Bear Stearns Fails'),\n", " (datetime(2008, 9, 15), 'Lehman Bankruptcy')\n", "]\n", "\n", "for date, label in crisis_data:\n", " ax.annotate(label, xy=(date, spx.asof(date) + 50),\n", " xytext=(date, spx.asof(date) + 200),\n", " arrowprops=dict(facecolor='black'),\n", " horizontalalignment='left', verticalalignment='top')\n", "\n", "# Zoom in on 2007-2010\n", "ax.set_xlim(['1/1/2007', '1/1/2011'])\n", "ax.set_ylim([600, 1800])\n", "\n", "ax.set_title('Important dates in 2008-2009 financial crisis')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig = plt.figure()\n", "ax = fig.add_subplot(1, 1, 1)\n", "\n", "rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color='k', alpha=0.3)\n", "circ = plt.Circle((0.7, 0.2), 0.15, color='b', alpha=0.3)\n", "pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],\n", " color='g', alpha=0.5)\n", "\n", "ax.add_patch(rect)\n", "ax.add_patch(circ)\n", "ax.add_patch(pgon)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Saving plots to file" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig.savefig('figpath.svg')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig.savefig('figpath.png', dpi=400, bbox_inches='tight')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from io import BytesIO\n", "buffer = BytesIO()\n", "plt.savefig(buffer)\n", "plot_data = buffer.getvalue()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### matplotlib configuration" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.rc('figure', figsize=(10, 10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plotting functions in pandas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Line plots" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.close('all')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "s = Series(np.random.randn(10).cumsum(), index=np.arange(0, 100, 10))\n", "s.plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df = DataFrame(np.random.randn(10, 4).cumsum(0),\n", " columns=['A', 'B', 'C', 'D'],\n", " index=np.arange(0, 100, 10))\n", "df.plot()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Bar plots" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig, axes = plt.subplots(2, 1)\n", "data = Series(np.random.rand(16), index=list('abcdefghijklmnop'))\n", "data.plot(kind='bar', ax=axes[0], color='k', alpha=0.7)\n", "data.plot(kind='barh', ax=axes[1], color='k', alpha=0.7)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df = DataFrame(np.random.rand(6, 4),\n", " index=['one', 'two', 'three', 'four', 'five', 'six'],\n", " columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus'))\n", "df\n", "df.plot(kind='bar')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "df.plot(kind='barh', stacked=True, alpha=0.5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "tips = pd.read_csv('ch08/tips.csv')\n", "party_counts = pd.crosstab(tips.day, tips.size)\n", "party_counts\n", "# Not many 1- and 6-person parties\n", "party_counts = party_counts.ix[:, 2:5]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# Normalize to sum to 1\n", "party_pcts = party_counts.div(party_counts.sum(1).astype(float), axis=0)\n", "party_pcts\n", "\n", "party_pcts.plot(kind='bar', stacked=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Histograms and density plots" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "tips['tip_pct'] = tips['tip'] / tips['total_bill']\n", "tips['tip_pct'].hist(bins=50)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "tips['tip_pct'].plot(kind='kde')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "comp1 = np.random.normal(0, 1, size=200) # N(0, 1)\n", "comp2 = np.random.normal(10, 2, size=200) # N(10, 4)\n", "values = Series(np.concatenate([comp1, comp2]))\n", "values.hist(bins=100, alpha=0.3, color='k', normed=True)\n", "values.plot(kind='kde', style='k--')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Scatter plots" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "macro = pd.read_csv('ch08/macrodata.csv')\n", "data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]\n", "trans_data = np.log(data).diff().dropna()\n", "trans_data[-5:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.figure()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.scatter(trans_data['m1'], trans_data['unemp'])\n", "plt.title('Changes in log %s vs. log %s' % ('m1', 'unemp'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pd.scatter_matrix(trans_data, diagonal='kde', color='k', alpha=0.3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plotting Maps: Visualizing Haiti Earthquake Crisis data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data = pd.read_csv('ch08/Haiti.csv')\n", "data.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data[['INCIDENT DATE', 'LATITUDE', 'LONGITUDE']][:10]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data['CATEGORY'][:6]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20) &\n", " (data.LONGITUDE > -75) & (data.LONGITUDE < -70)\n", " & data.CATEGORY.notnull()]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def to_cat_list(catstr):\n", " stripped = (x.strip() for x in catstr.split(','))\n", " return [x for x in stripped if x]\n", "\n", "def get_all_categories(cat_series):\n", " cat_sets = (set(to_cat_list(x)) for x in cat_series)\n", " return sorted(set.union(*cat_sets))\n", "\n", "def get_english(cat):\n", " code, names = cat.split('.')\n", " if '|' in names:\n", " names = names.split(' | ')[1]\n", " return code, names.strip()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "get_english('2. Urgences logistiques | Vital Lines')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "all_cats = get_all_categories(data.CATEGORY)\n", "# Generator expression\n", "english_mapping = dict(get_english(x) for x in all_cats)\n", "english_mapping['2a']\n", "english_mapping['6c']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def get_code(seq):\n", " return [x.split('.')[0] for x in seq if x]\n", "\n", "all_codes = get_code(all_cats)\n", "code_index = pd.Index(np.unique(all_codes))\n", "dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),\n", " index=data.index, columns=code_index)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dummy_frame.ix[:, :6].info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "for row, cat in zip(data.index, data.CATEGORY):\n", " codes = get_code(to_cat_list(cat))\n", " dummy_frame.ix[row, codes] = 1\n", "\n", "data = data.join(dummy_frame.add_prefix('category_'))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "data.ix[:, 10:15].info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from mpl_toolkits.basemap import Basemap\n", "import matplotlib.pyplot as plt\n", "\n", "def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25,\n", " lllon=-75, urlon=-71):\n", " # create polar stereographic Basemap instance.\n", " m = Basemap(ax=ax, projection='stere',\n", " lon_0=(urlon + lllon) / 2,\n", " lat_0=(urlat + lllat) / 2,\n", " llcrnrlat=lllat, urcrnrlat=urlat,\n", " llcrnrlon=lllon, urcrnrlon=urlon,\n", " resolution='f')\n", " # draw coastlines, state and country boundaries, edge of map.\n", " m.drawcoastlines()\n", " m.drawstates()\n", " m.drawcountries()\n", " return m" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))\n", "fig.subplots_adjust(hspace=0.05, wspace=0.05)\n", "\n", "to_plot = ['2a', '1', '3c', '7a']\n", "\n", "lllat=17.25; urlat=20.25; lllon=-75; urlon=-71\n", "\n", "for code, ax in zip(to_plot, axes.flat):\n", " m = basic_haiti_map(ax, lllat=lllat, urlat=urlat,\n", " lllon=lllon, urlon=urlon)\n", "\n", " cat_data = data[data['category_%s' % code] == 1]\n", "\n", " # compute map proj coordinates.\n", " x, y = m(cat_data.LONGITUDE.values, cat_data.LATITUDE.values)\n", "\n", " m.plot(x, y, 'k.', alpha=0.5)\n", " ax.set_title('%s: %s' % (code, english_mapping[code]))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))\n", "fig.subplots_adjust(hspace=0.05, wspace=0.05)\n", "\n", "to_plot = ['2a', '1', '3c', '7a']\n", "\n", "lllat=17.25; urlat=20.25; lllon=-75; urlon=-71\n", "\n", "def make_plot():\n", "\n", " for i, code in enumerate(to_plot):\n", " cat_data = data[data['category_%s' % code] == 1]\n", " lons, lats = cat_data.LONGITUDE, cat_data.LATITUDE\n", "\n", " ax = axes.flat[i]\n", " m = basic_haiti_map(ax, lllat=lllat, urlat=urlat,\n", " lllon=lllon, urlon=urlon)\n", "\n", " # compute map proj coordinates.\n", " x, y = m(lons.values, lats.values)\n", "\n", " m.plot(x, y, 'k.', alpha=0.5)\n", " ax.set_title('%s: %s' % (code, english_mapping[code]))\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "make_plot()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "shapefile_path = 'ch08/PortAuPrince_Roads/PortAuPrince_Roads'\n", "m.readshapefile(shapefile_path, 'roads')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.10" } }, "nbformat": 4, "nbformat_minor": 0 }