{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%load_ext load_style\n", "%load_style talk.css" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# PCA of SST anomalies in the Pacific with scikit-learn" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from numpy import ma\n", "from matplotlib import pyplot as plt\n", "from mpl_toolkits.basemap import Basemap" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "%matplotlib inline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## load the SST data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The file (74 Mb) can be downloaded at `ftp://ftp.niwa.co.nz/incoming/fauchereaun/ersst.realtime.nc`" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import xray" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dset = xray.open_dataset('../data/ersst.realtime.nc')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### selects the period 1980 - 2014 and the tropical Pacific domain" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "dsub = dset.sel(time=slice('1980','2014'), zlev=0, lat=slice(-40,40), lon=slice(120,290))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "lat = dsub['lat'].values\n", "lon = dsub['lon'].values\n", "sst = dsub['anom'].values" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "sst.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### reshape in 2D (time, space)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X = np.reshape(sst, (sst.shape[0], len(lat) * len(lon)), order='F')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "np.any(np.isnan(X))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Mask the land points" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "type(X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X = ma.masked_array(X, np.isnan(X))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "type(X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "land = X.sum(0).mask" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "ocean = -land" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### keep only oceanic grid-points" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X = X[:,ocean]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Standardize SST using the fit and 
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Standardize SST using the `fit` and `transform` methods of `sklearn.preprocessing.StandardScaler`" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn import preprocessing\n", "scaler = preprocessing.StandardScaler()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "scaler_sst = scaler.fit(X)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Once the scaler object has been 'trained' on the data, we can save it as a pickle object" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn.externals import joblib" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "joblib.dump(scaler_sst, '../data/scaler_sst.pkl', compress=9)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "scaler_sst = joblib.load('../data/scaler_sst.pkl')" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### scale: use the `transform` method of the scaler object" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X = scaler_sst.transform(X)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### verify that mean = 0 and std = 1" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X.mean()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X.std()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "X.shape" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### EOF decomposition" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn.decomposition import PCA" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### instantiate the PCA object" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "skpca = PCA()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### fit" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "skpca.fit(X)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Now save the (fitted) PCA object for reuse in operations" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "joblib.dump(skpca, '../data/EOF.pkl', compress=9)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from matplotlib import style\n", "style.use('fivethirtyeight')" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "style.available" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "f, ax = plt.subplots(figsize=(6,6))\n", "ax.plot(skpca.explained_variance_ratio_[0:10]*100)\n", "ax.plot(skpca.explained_variance_ratio_[0:10]*100,'ro')" ] },
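{ "cell_type": "markdown", "metadata": {}, "source": [ "#### the scree plot can also be read numerically: a minimal sketch tabulating the explained and cumulative explained variance of the leading components (uses only `skpca.explained_variance_ratio_`)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# sketch: explained variance ratio (in %) and its running total, first 10 components\n", "evr = skpca.explained_variance_ratio_[:10] * 100\n", "pd.DataFrame({'explained (%)': evr, 'cumulative (%)': evr.cumsum()}, index=np.arange(1, 11))" ] },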
null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "ipc = np.where(skpca.explained_variance_ratio_.cumsum() >= 0.70)[0][0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "ipc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The Principal Components (PCs) are obtained by using the `transform` method of the `pca` object (`skpca`)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "PCs = skpca.transform(X)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "PCs = PCs[:,:ipc]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### The Empirical Orthogonal Functions (EOFs) are contained in the `components_` attribute of the `pca` object (`skpca`)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "EOFs = skpca.components_" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "EOFs = EOFs[:ipc,:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "EOFs.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### we can the reconstruct the 2D fields (maps)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "EOF_recons = np.ones((ipc, len(lat) * len(lon))) * -999." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "for i in xrange(ipc): \n", " EOF_recons[i,ocean] = EOFs[i,:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "EOF_recons = ma.masked_values(np.reshape(EOF_recons, (ipc, len(lat), len(lon)), order='F'), -999.)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "EOF_recons.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "plt.imshow(EOF_recons[0,:,:], origin='lower', interpolation='nearest', aspect='auto')\n", "plt.colorbar();" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### scale the Principal Components" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "scaler_PCs = StandardScaler()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "scaler_PCs.fit(PCs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "PCs_std = scaler_PCs.transform(PCs)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "joblib.dump(scaler_PCs, '../data/scaler_PCs.pkl')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "PCdf = pd.DataFrame(PCs_std, index = dsub['time'], \\\n", " columns = [\"EOF%s\" % (x) for x in xrange(1, PCs_std.shape[1] +1)])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false 
}, "outputs": [], "source": [ "PCdf.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "PCdf.to_csv('../data/EOF_ERSST_PCs.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "from scipy.signal import detrend" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "f, ax = plt.subplots(figsize=(12,6))\n", "PCdf.ix[:,0].plot(ax=ax, color='k', label='PC1')\n", "#ax.set_xlabel('period', fontsize=18)\n", "ax.plot(PCdf.index, detrend(PCdf.ix[:,0].values), 'r', label='PC1 (trend removed)')\n", "ax.grid('off')\n", "ax.legend(loc=1); " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "!ipython nbconvert sklearn_EOF_decomposition.ipynb --to html" ] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }