{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Populating the interactive namespace from numpy and matplotlib\n", "Variables now available: df\n" ] } ], "source": [ "%run 1.0-adm-load-data-2012.ipynb" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", "from sklearn.decomposition import PCA\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22;\n", "# prefer its replacement and fall back to the legacy class on old installs.\n", "try:\n", "    from sklearn.impute import SimpleImputer as Imputer\n", "except ImportError:\n", "    from sklearn.preprocessing import Imputer\n", "\n", "imp = Imputer(strategy='mean')\n", "scl = StandardScaler()\n", "pca = PCA()\n", "\n", "# Full chain: impute missing values -> standardise -> PCA.\n", "pipeline = Pipeline([\n", "    ('imp', imp),\n", "    ('scl', scl),\n", "    ('pca', pca),\n", "])\n", "\n", "# NOTE: scaler_pipeline deliberately reuses the *same* `imp` and `scl`\n", "# instances as `pipeline`. It is never fitted directly: it can only transform\n", "# after `pipeline.fit_transform(...)` has fitted those shared steps, and it\n", "# always reflects the most recent fit of `pipeline`.\n", "scaler_pipeline = Pipeline([\n", "    ('imp', imp),\n", "    ('scl', scl),\n", "])\n", "\n", "data_pca = pipeline.fit_transform(df)\n", "_scaled = scaler_pipeline.transform(df)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Split respondents by the sign of their score on the first principal component.\n", "# NOTE(review): the dem/rep naming assumes positive PC1 means conservative; the\n", "# sign of a PCA axis is arbitrary, so confirm against the loadings.\n", "dem = df[data_pca[:, 0] <= 0]\n", "rep = df[data_pca[:, 0] > 0]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# Refit the shared pipeline on the `rep` subset only. This overwrites the fitted\n", "# state of `imp`, `scl` and `pca`, so from here on `pca` describes `rep` and\n", "# `data_pca` (computed on all of `df`) no longer corresponds to it.\n", "rep_pca = pipeline.fit_transform(rep)\n", "rep_scaled = scaler_pipeline.transform(rep)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def plot_explained_variance(pca):\n", "    \"\"\"Plot per-component and cumulative explained variance of a fitted PCA.\n", "\n", "    Renders an interactive plotly figure inline: one bar per principal\n", "    component (its explained-variance ratio) and a line with the running total.\n", "\n", "    Parameters\n", "    ----------\n", "    pca : fitted estimator exposing ``explained_variance_ratio_``\n", "        (for example ``sklearn.decomposition.PCA`` after fitting).\n", "    \"\"\"\n", "    import plotly\n", "    from plotly.graph_objs import Bar, Layout, Scatter\n", "    plotly.offline.init_notebook_mode()  # must run before iplot can render inline\n", "\n", "    explained_var = pca.explained_variance_ratio_\n", "    cum_var_exp = np.cumsum(explained_var)\n", "    # Number components from 1 so the axis agrees with the 1-based\n", "    # 'Principal Component N' labels used elsewhere in this notebook.\n", "    component_numbers = np.arange(1, len(explained_var) + 1)\n", "\n", "    plotly.offline.iplot({\n", "        \"data\": [\n", "            Bar(x=component_numbers, y=explained_var, name='individual explained variance'),\n", "            Scatter(x=component_numbers, y=cum_var_exp, name='cumulative explained variance'),\n", "        ],\n", "        # Plain dicts instead of the deprecated XAxis/YAxis helper classes.\n", "        \"layout\": Layout(xaxis=dict(title='Principal components'),\n", "                         yaxis=dict(title='Explained variance ratio')),\n", "    })\n", "\n", "plot_explained_variance(pca)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def biplot(pca, dat, title='', show_points=True, components=(0, 1)):\n", "    \"\"\"Draw an interactive PCA biplot: projected samples plus feature loadings.\n", "\n", "    Parameters\n", "    ----------\n", "    pca : fitted PCA-like estimator exposing ``components_`` and ``transform``.\n", "    dat : pandas.DataFrame\n", "        Data in the feature space ``pca`` was fitted on; its column names\n", "        label the loading vectors.\n", "    title : str\n", "        Figure title.\n", "    show_points : bool\n", "        If True, also scatter the projected rows of ``dat``.\n", "    components : tuple of two ints\n", "        Zero-based indices of the components shown on the x and y axes.\n", "    \"\"\"\n", "    import plotly\n", "    from plotly.graph_objs import Layout, Scatter\n", "    plotly.offline.init_notebook_mode()  # must run before iplot can render inline\n", "\n", "    pc1, pc2 = components\n", "\n", "    # Loadings of every original feature on the two selected components.\n", "    xvector = pca.components_[pc1]\n", "    yvector = pca.components_[pc2]\n", "\n", "    projected = pca.transform(dat.values)\n", "    xs = projected[:, pc1]\n", "    ys = projected[:, pc2]\n", "    # Loading vectors are stretched to the largest projected coordinate on each\n", "    # axis; compute those once instead of on every loop iteration.\n", "    x_scale = xs.max()\n", "    y_scale = ys.max()\n", "\n", "    if show_points:\n", "        traces = [Scatter(x=xs, y=ys, mode='markers', marker=dict(size=1), name='observations')]\n", "    else:\n", "        traces = []\n", "\n", "    # One line from the origin per feature, labelled at its tip.\n", "    for label, x_load, y_load in zip(dat.columns, xvector, yvector):\n", "        traces.append(\n", "            Scatter(\n", "                x=[0, x_load * x_scale],\n", "                y=[0, y_load * y_scale],\n", "                mode='lines+text',\n", "                text=['', label],\n", "                name=label,\n", "            ))\n", "\n", "    plotly.offline.iplot({\n", "        \"data\": traces,\n", "        # Plain dicts instead of the deprecated XAxis/YAxis helper classes.\n", "        \"layout\": Layout(xaxis=dict(title='Principal Component ' + str(pc1 + 1)),\n", "                         yaxis=dict(title='Principal Component ' + str(pc2 + 1)),\n", "                         title=title)\n", "    })\n", "\n", "\n", "biplot(pca, pd.DataFrame(rep_scaled, columns=df.columns), title='Biplot for conservatives', components=(0, 1))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "campfin_limcorp 0.404807\n", "pid_self -0.369540\n", "spsrvpr_ssself 2.838948\n", "defsppr_self -4.629902\n", "inspre_self -5.249009\n", "gun_control 0.179421\n", "guarpr_self -5.256634\n", "immig_policy 2.282252\n", "aidblack_self -5.851562\n", "envjob_self -4.224597\n", "aa_uni -0.676372\n", "fedspend_ss 0.276444\n", "fedspend_schools 0.217910\n", "fedspend_scitech 0.208832\n", "fedspend_crime 0.358561\n", "fedspend_welfare -0.666667\n", 
"envir_gwarm -1.322544\n", "gayrt_marry -2.135054\n", "penalty_favdpen 1.148679\n", "relig_churchoft 2.157926\n", "dem_edu 10.723117\n", "dem_veteran 1.832222\n", "budget_rdefctax 0.037067\n", "budget_rdefmil -0.414125\n", "patriot_amident 1.485612\n", "milln_milltax 0.424157\n", "budget_rdef250k 1.704536\n", "fairjob_opin -1.792572\n", "immigpo_jobs 2.419887\n", "wiretap_warrant 0.368209\n", "postvote_presvt 1.014540\n", "dtype: float64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rep.mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 1 }