{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n",
"Variables now available: df\n"
]
}
],
"source": [
"%run 1.0-adm-load-data-2012.ipynb"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.decomposition import PCA\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"try:\n",
"    # `Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22;\n",
"    # `SimpleImputer` is its replacement. It is aliased to the old name so the\n",
"    # rest of the notebook keeps working unchanged.\n",
"    from sklearn.impute import SimpleImputer as Imputer\n",
"except ImportError:\n",
"    # Older scikit-learn releases only ship the legacy class.\n",
"    from sklearn.preprocessing import Imputer\n",
"\n",
"# Mean-impute missing values, standardise each column, then run a full PCA.\n",
"imp = Imputer(strategy='mean')\n",
"scl = StandardScaler()\n",
"pca = PCA()\n",
"\n",
"# Both pipelines deliberately hold the *same* `imp` and `scl` objects:\n",
"# `scaler_pipeline` is never fitted itself and only works because\n",
"# `pipeline.fit_transform` fits the shared steps first. Keep the fit call\n",
"# ahead of the `scaler_pipeline.transform` call.\n",
"pipeline = Pipeline([\n",
"    ('imp', imp),\n",
"    ('scl', scl),\n",
"    ('pca', pca),\n",
"])\n",
"scaler_pipeline = Pipeline([\n",
"    ('imp', imp),\n",
"    ('scl', scl),\n",
"])\n",
"\n",
"data_pca = pipeline.fit_transform(df)    # component scores for every row of df\n",
"_scaled = scaler_pipeline.transform(df)  # imputed + standardised features (pre-PCA)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Split the rows of `df` on the sign of their first-principal-component score.\n",
"# NOTE(review): the orientation of a principal component is arbitrary, so which\n",
"# side really corresponds to `rep` vs `dem` should be confirmed from the loadings.\n",
"pc1_scores = data_pca[:, 0]\n",
"dem = df.loc[pc1_scores <= 0]\n",
"rep = df.loc[pc1_scores > 0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Refit the shared pipeline on the `rep` subset only. This overwrites the fitted\n",
"# state of `imp`, `scl` and `pca`, so every later use of `pca` refers to the\n",
"# subset fit, not the full-sample fit that produced `data_pca`.\n",
"rep_pca = pipeline.fit_transform(X=rep)\n",
"\n",
"# `scaler_pipeline` holds the same imputer/scaler objects that were just refitted,\n",
"# so this must run after the line above.\n",
"rep_scaled = scaler_pipeline.transform(X=rep)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def plot_explained_variance(pca, title=''):\n",
"    \"\"\"Plot individual and cumulative explained-variance ratios of a fitted PCA.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    pca : object\n",
"        Fitted estimator exposing ``explained_variance_ratio_``.\n",
"    title : str, optional\n",
"        Figure title; empty by default.\n",
"\n",
"    The figure is rendered inline with plotly; nothing is returned.\n",
"    \"\"\"\n",
"    import plotly.offline\n",
"    from plotly.graph_objs import Bar, Layout, Scatter\n",
"\n",
"    plotly.offline.init_notebook_mode()  # loads plotly.js into the notebook for iplot\n",
"\n",
"    explained_var = np.asarray(pca.explained_variance_ratio_)\n",
"    cum_var_exp = np.cumsum(explained_var)\n",
"    # Number the components from 1 so the x axis agrees with the\n",
"    # 'Principal Component N' axis labels that biplot() produces.\n",
"    component_no = np.arange(1, len(explained_var) + 1)\n",
"\n",
"    plotly.offline.iplot({\n",
"        \"data\": [\n",
"            Bar(x=component_no, y=explained_var, name='individual explained variance'),\n",
"            Scatter(x=component_no, y=cum_var_exp, name='cumulative explained variance'),\n",
"        ],\n",
"        # Plain dicts replace the deprecated XAxis/YAxis graph objects.\n",
"        \"layout\": Layout(\n",
"            title=title,\n",
"            xaxis=dict(title='Principal components'),\n",
"            yaxis=dict(title='Explained variance ratio'),\n",
"        ),\n",
"    })\n",
"\n",
"\n",
"plot_explained_variance(pca)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def biplot(pca, dat, title='', show_points=True, components=(0, 1)):\n",
"    \"\"\"Draw a PCA biplot: projected observations plus one loading vector per feature.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    pca : object\n",
"        Fitted estimator exposing ``components_`` and ``transform``.\n",
"    dat : pandas.DataFrame\n",
"        Data in the feature space ``pca`` was fitted on; its column labels are\n",
"        used to annotate the loading vectors.\n",
"    title : str, optional\n",
"        Figure title.\n",
"    show_points : bool, optional\n",
"        When False only the loading vectors are drawn.\n",
"    components : tuple of two ints, optional\n",
"        Zero-based indices of the components shown on the x and y axes.\n",
"\n",
"    The figure is rendered inline with plotly; nothing is returned.\n",
"    \"\"\"\n",
"    import plotly.offline\n",
"    from plotly.graph_objs import Layout, Scatter\n",
"\n",
"    plotly.offline.init_notebook_mode()  # loads plotly.js into the notebook for iplot\n",
"\n",
"    pc1, pc2 = components\n",
"    xvector = pca.components_[pc1]\n",
"    yvector = pca.components_[pc2]\n",
"\n",
"    scores = pca.transform(dat.values)\n",
"    xs = scores[:, pc1]\n",
"    ys = scores[:, pc2]\n",
"\n",
"    # Loading vectors are stretched to the largest score on each axis so they\n",
"    # are visible next to the points; compute the factors once, not per feature.\n",
"    x_scale = xs.max()\n",
"    y_scale = ys.max()\n",
"\n",
"    traces = []\n",
"    if show_points:\n",
"        traces.append(\n",
"            Scatter(x=xs, y=ys, mode='markers', marker=dict(size=1), name='observations')\n",
"        )\n",
"\n",
"    for column, x_load, y_load in zip(dat.columns, xvector, yvector):\n",
"        label = str(column)  # plotly expects text labels; columns may be non-string\n",
"        traces.append(\n",
"            Scatter(\n",
"                x=[0, x_load * x_scale],\n",
"                y=[0, y_load * y_scale],\n",
"                mode='lines+text',\n",
"                text=['', label],\n",
"                name=label,\n",
"            )\n",
"        )\n",
"\n",
"    plotly.offline.iplot({\n",
"        \"data\": traces,\n",
"        # Plain dicts replace the deprecated XAxis/YAxis graph objects.\n",
"        \"layout\": Layout(\n",
"            xaxis=dict(title='Principal Component ' + str(pc1 + 1)),\n",
"            yaxis=dict(title='Principal Component ' + str(pc2 + 1)),\n",
"            title=title,\n",
"        ),\n",
"    })\n",
"\n",
"\n",
"biplot(pca, pd.DataFrame(rep_scaled, columns=df.columns), title='Biplot for conservatives', components=(0, 1))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"campfin_limcorp 0.404807\n",
"pid_self -0.369540\n",
"spsrvpr_ssself 2.838948\n",
"defsppr_self -4.629902\n",
"inspre_self -5.249009\n",
"gun_control 0.179421\n",
"guarpr_self -5.256634\n",
"immig_policy 2.282252\n",
"aidblack_self -5.851562\n",
"envjob_self -4.224597\n",
"aa_uni -0.676372\n",
"fedspend_ss 0.276444\n",
"fedspend_schools 0.217910\n",
"fedspend_scitech 0.208832\n",
"fedspend_crime 0.358561\n",
"fedspend_welfare -0.666667\n",
"envir_gwarm -1.322544\n",
"gayrt_marry -2.135054\n",
"penalty_favdpen 1.148679\n",
"relig_churchoft 2.157926\n",
"dem_edu 10.723117\n",
"dem_veteran 1.832222\n",
"budget_rdefctax 0.037067\n",
"budget_rdefmil -0.414125\n",
"patriot_amident 1.485612\n",
"milln_milltax 0.424157\n",
"budget_rdef250k 1.704536\n",
"fairjob_opin -1.792572\n",
"immigpo_jobs 2.419887\n",
"wiretap_warrant 0.368209\n",
"postvote_presvt 1.014540\n",
"dtype: float64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-column mean of the raw (unscaled) values for the `rep` subset.\n",
"rep.mean(axis=0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}