{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%load_ext load_style\n", "%load_style talk.css" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# k-means clustering of SST anomalies in the Pacific with scikit-learn" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%matplotlib inline\n", "from matplotlib import pyplot as plt" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import os\n", "import numpy as np\n", "import pandas as pd\n", "from mpl_toolkits.basemap import Basemap as bm" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import xray" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### defines a function to plot a field (must be 2D)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def plot_field(X, lat, lon, vmin, vmax, step, cmap=plt.get_cmap('jet'), ax=False, title=False, grid=False):\n", " if not ax: \n", " f, ax = plt.subplots(figsize=(10, (X.shape[0] / float(X.shape[1])) * 10))\n", " m.ax = ax\n", " im = m.contourf(lons, lats, X, np.arange(vmin, vmax+step, step), latlon=True, cmap=cmap, extend='both', ax=ax)\n", " m.drawcoastlines()\n", " if grid: \n", " m.drawmeridians(np.arange(0, 360, 60), labels=[0,0,0,1])\n", " m.drawparallels([-40, 0, 40], labels=[1,0,0,0])\n", " m.colorbar(im)\n", " if title: \n", " ax.set_title(title)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### reads the PCs (obtained before, see [sklearn_EOF_decomposition.ipynb](./sklearn_EOF_decomposition.ipynb))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "PCs = pd.read_csv('../data/EOF_ERSST_PCs.csv', index_col=0, parse_dates=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "PCs.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### import the KMeans class from scikit-learn" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn.cluster import KMeans" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### How many clusters do we want ? " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "nclusters = 6" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### instantiates the k-means class" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "KMeans?" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "kmeans = KMeans(init='k-means++', n_clusters=nclusters, n_init=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### fit to the data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X = PCs.values" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "type(X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "kmeans.fit(X) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### the classes (clusters) are contained in the .labels_ attributes" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "kmeans.labels_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Calculate composite anomalies for each of the clusters" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "ncfname = '../data/ersst.realtime.nc'" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dset = xray.open_dataset(ncfname)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dsub = dset.sel(time=slice('1980','2014'),lat=slice(-40,40), lon=slice(120,290))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dsub" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "lat = dsub['lat'].values\n", "lon = dsub['lon'].values\n", "lons, lats = np.meshgrid(lon, lat)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "labels = pd.DataFrame(kmeans.labels_, index=dsub['time'], columns=['cluster'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "labels.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "pd.unique(labels.cluster)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "c = 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "clussub = labels.query('cluster == {}'.format(c))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "clussub.index" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "cluster = dsub.sel(time=clussub.index).mean('time')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.imshow(cluster['anom'][0,::-1,:])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "m = bm(projection='cyl',llcrnrlat=-40,urcrnrlat=40,\\\n", " llcrnrlon=120,urcrnrlon=290,\\\n", " lat_ts=0,resolution='c')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "f, axes = plt.subplots(nrows=3,ncols=2, figsize=(14,10))\n", "f.subplots_adjust(hspace=0.1, wspace=0.1)\n", "axes = axes.flatten()\n", "for c in xrange(nclusters):\n", " index = labels.query('cluster == {}'.format(c))\n", " cluster = dsub.sel(time=index.index).mean('time')\n", " ax = axes[c]\n", " plot_field(cluster['anom'][0,:,:], lats, lons, -2, 2, 0.1, \\\n", " ax=ax, cmap=plt.get_cmap('RdBu_r'), \\\n", " title=\"Cluster #{}: {} months\".format(c+1, len(index)))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!ipython nbconvert sklearn_kmeans.ipynb --to html" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }