{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Welcome to your Quantitative Social Sciences Analysis Toolkit!\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. load survey data into the notebook \n", "(run this first & run this every time you close and reopen the notebook)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "import pandas as pd # load a specialized piece of software that will help us with the analysis\n", "data = pd.read_csv('data/anes_pilot_2019.csv',low_memory=False) # read in the table of data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. display data" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...3155315631573158315931603161316231633164
versionANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204...ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204ANES 2019 Pilot Study version 20200204
caseid12345678910...3156315731583159316031613162316331643165
weight1.34719693063187.780822076219216.9663669306949571.103485147803741.090697302567411.02140871415171.964514474045239.834692588582321.535415420208531.32458088383641...1.17827101584555.783602487218187.7925087444237367.03646496881757.8928332361473031.58161278448241.809576969671362
weight_spss1.10160293017768.638478211724453.790198239229266.902319805359118.891863184309371.835205905561853.788683485426792.6825281296837631.255509189104511.08310978871303....963472209656906.640751753798312.6480344003152895.75371740500213.730069737197651.29328477387127.661991088100273
form1112222111...2122122221
..................................................................
starttime12/31/2019 18:57:3312/21/2019 4:19:5612/22/2019 23:03:2812/31/2019 19:53:1412/21/2019 4:07:0912/21/2019 22:45:1812/27/2019 19:16:0512/21/2019 23:21:5512/25/2019 5:39:5112/28/2019 3:09:16...12/31/2019 19:41:5312/31/2019 19:40:2812/31/2019 19:40:5912/31/2019 19:41:2612/31/2019 19:42:1312/31/2019 19:38:1312/31/2019 20:14:3412/31/2019 20:10:0412/31/2019 22:10:0512/31/2019 23:27:51
endtime12/31/2019 19:39:4912/21/2019 4:53:1912/22/2019 23:41:4312/31/2019 20:23:1112/21/2019 4:48:5012/22/2019 0:28:2712/27/2019 19:45:4512/21/2019 23:40:2012/25/2019 5:57:2112/28/2019 3:35:48...12/31/2019 20:08:2012/31/2019 20:17:5012/31/2019 20:13:3212/31/2019 20:22:4512/31/2019 20:28:2312/31/2019 20:24:5612/31/2019 20:53:5012/31/2019 20:29:1512/31/2019 22:52:371/1/2020 0:21:59
duration2536200322951797250161891780110510501592...1587224219532479277028032356115125523248
pop_density_public15201800707600443011900700450005700120...40037002000180020066001
flag_state0000000000...0000000
\n", "

900 rows × 3165 columns

\n", "
" ], "text/plain": [ " 0 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 1 \n", "weight 1.34719693063187 \n", "weight_spss 1.10160293017768 \n", "form 1 \n", "... ... \n", "starttime 12/31/2019 18:57:33 \n", "endtime 12/31/2019 19:39:49 \n", "duration 2536 \n", "pop_density_public 1520 \n", "flag_state 0 \n", "\n", " 1 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 2 \n", "weight .780822076219216 \n", "weight_spss .638478211724453 \n", "form 1 \n", "... ... \n", "starttime 12/21/2019 4:19:56 \n", "endtime 12/21/2019 4:53:19 \n", "duration 2003 \n", "pop_density_public 1800 \n", "flag_state 0 \n", "\n", " 2 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 3 \n", "weight .966366930694957 \n", "weight_spss .790198239229266 \n", "form 1 \n", "... ... \n", "starttime 12/22/2019 23:03:28 \n", "endtime 12/22/2019 23:41:43 \n", "duration 2295 \n", "pop_density_public 70 \n", "flag_state 0 \n", "\n", " 3 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 4 \n", "weight 1.10348514780374 \n", "weight_spss .902319805359118 \n", "form 2 \n", "... ... \n", "starttime 12/31/2019 19:53:14 \n", "endtime 12/31/2019 20:23:11 \n", "duration 1797 \n", "pop_density_public 7600 \n", "flag_state 0 \n", "\n", " 4 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 5 \n", "weight 1.09069730256741 \n", "weight_spss .891863184309371 \n", "form 2 \n", "... ... \n", "starttime 12/21/2019 4:07:09 \n", "endtime 12/21/2019 4:48:50 \n", "duration 2501 \n", "pop_density_public 4430 \n", "flag_state 0 \n", "\n", " 5 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 6 \n", "weight 1.02140871415171 \n", "weight_spss .835205905561853 \n", "form 2 \n", "... ... 
\n", "starttime 12/21/2019 22:45:18 \n", "endtime 12/22/2019 0:28:27 \n", "duration 6189 \n", "pop_density_public 11900 \n", "flag_state 0 \n", "\n", " 6 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 7 \n", "weight .964514474045239 \n", "weight_spss .788683485426792 \n", "form 2 \n", "... ... \n", "starttime 12/27/2019 19:16:05 \n", "endtime 12/27/2019 19:45:45 \n", "duration 1780 \n", "pop_density_public 700 \n", "flag_state 0 \n", "\n", " 7 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 8 \n", "weight .83469258858232 \n", "weight_spss .682528129683763 \n", "form 1 \n", "... ... \n", "starttime 12/21/2019 23:21:55 \n", "endtime 12/21/2019 23:40:20 \n", "duration 1105 \n", "pop_density_public 45000 \n", "flag_state 0 \n", "\n", " 8 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 9 \n", "weight 1.53541542020853 \n", "weight_spss 1.25550918910451 \n", "form 1 \n", "... ... \n", "starttime 12/25/2019 5:39:51 \n", "endtime 12/25/2019 5:57:21 \n", "duration 1050 \n", "pop_density_public 5700 \n", "flag_state 0 \n", "\n", " 9 ... \\\n", "version ANES 2019 Pilot Study version 20200204 ... \n", "caseid 10 ... \n", "weight 1.32458088383641 ... \n", "weight_spss 1.08310978871303 ... \n", "form 1 ... \n", "... ... ... \n", "starttime 12/28/2019 3:09:16 ... \n", "endtime 12/28/2019 3:35:48 ... \n", "duration 1592 ... \n", "pop_density_public 120 ... \n", "flag_state 0 ... \n", "\n", " 3155 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 3156 \n", "weight 1.17827101584555 \n", "weight_spss .963472209656906 \n", "form 2 \n", "... ... \n", "starttime 12/31/2019 19:41:53 \n", "endtime 12/31/2019 20:08:20 \n", "duration 1587 \n", "pop_density_public 400 \n", "flag_state 0 \n", "\n", " 3156 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 3157 \n", "weight .783602487218187 \n", "weight_spss .640751753798312 \n", "form 1 \n", "... ... 
\n", "starttime 12/31/2019 19:40:28 \n", "endtime 12/31/2019 20:17:50 \n", "duration 2242 \n", "pop_density_public 3700 \n", "flag_state 0 \n", "\n", " 3157 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 3158 \n", "weight .792508744423736 \n", "weight_spss .648034400315289 \n", "form 2 \n", "... ... \n", "starttime 12/31/2019 19:40:59 \n", "endtime 12/31/2019 20:13:32 \n", "duration 1953 \n", "pop_density_public 2000 \n", "flag_state 0 \n", "\n", " 3158 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 3159 \n", "weight \n", "weight_spss \n", "form 2 \n", "... ... \n", "starttime 12/31/2019 19:41:26 \n", "endtime 12/31/2019 20:22:45 \n", "duration 2479 \n", "pop_density_public \n", "flag_state \n", "\n", " 3159 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 3160 \n", "weight \n", "weight_spss \n", "form 1 \n", "... ... \n", "starttime 12/31/2019 19:42:13 \n", "endtime 12/31/2019 20:28:23 \n", "duration 2770 \n", "pop_density_public \n", "flag_state \n", "\n", " 3160 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 3161 \n", "weight \n", "weight_spss \n", "form 2 \n", "... ... \n", "starttime 12/31/2019 19:38:13 \n", "endtime 12/31/2019 20:24:56 \n", "duration 2803 \n", "pop_density_public \n", "flag_state \n", "\n", " 3161 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 3162 \n", "weight 7.03646496881757 \n", "weight_spss 5.75371740500213 \n", "form 2 \n", "... ... \n", "starttime 12/31/2019 20:14:34 \n", "endtime 12/31/2019 20:53:50 \n", "duration 2356 \n", "pop_density_public 1800 \n", "flag_state 0 \n", "\n", " 3162 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 3163 \n", "weight .892833236147303 \n", "weight_spss .73006973719765 \n", "form 2 \n", "... ... 
\n", "starttime 12/31/2019 20:10:04 \n", "endtime 12/31/2019 20:29:15 \n", "duration 1151 \n", "pop_density_public 200 \n", "flag_state 0 \n", "\n", " 3163 \\\n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 3164 \n", "weight 1.58161278448241 \n", "weight_spss 1.29328477387127 \n", "form 2 \n", "... ... \n", "starttime 12/31/2019 22:10:05 \n", "endtime 12/31/2019 22:52:37 \n", "duration 2552 \n", "pop_density_public 6600 \n", "flag_state 0 \n", "\n", " 3164 \n", "version ANES 2019 Pilot Study version 20200204 \n", "caseid 3165 \n", "weight .809576969671362 \n", "weight_spss .661991088100273 \n", "form 1 \n", "... ... \n", "starttime 12/31/2019 23:27:51 \n", "endtime 1/1/2020 0:21:59 \n", "duration 3248 \n", "pop_density_public 1 \n", "flag_state 0 \n", "\n", "[900 rows x 3165 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.T # display a snapshot of raw data -- the first column here shows your variables, \n", " # the other columns are responses" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. 
get category counts for a categorical variable\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "scrolled": false }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f3ab1ec39f1248e294d8b347aa104151", "version_major": 2, "version_minor": 0 }, "text/plain": [ "interactive(children=(Dropdown(description='variable', options=('CompletedSurveys', 'EnrollmentDate', 'FIPCoun…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "from prettytable import PrettyTable\n",
 "# SelectMultiple now comes from ipywidgets; the IPython.html shim is deprecated\n",
 "from ipywidgets import interact, SelectMultiple\n",
 "from IPython.display import display, HTML\n",
 "\n",
 "@interact(variable=data.columns.sort_values())\n",
 "def categorical_table(variable='V161002'):\n",
 "    \"\"\"Show how many respondents gave each answer to the chosen variable.\n",
 "\n",
 "    Rows with a missing (NaN) answer are left out, because groupby skips them.\n",
 "    \"\"\"\n",
 "    counts = data.groupby(variable).size()  # answer -> number of respondents\n",
 "    table = PrettyTable()\n",
 "    table.field_names = [variable, 'Count']\n",
 "    for answer, count in counts.items():\n",
 "        table.add_row((answer, count))\n",
 "    display(HTML(table.get_html_string()))"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. 
get average and spread for a continuous variable\n" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "91d143faa4794932b27007b8d2e4f968", "version_major": 2, "version_minor": 0 }, "text/plain": [ "interactive(children=(Dropdown(description='variable', options=('CompletedSurveys', 'EnrollmentDate', 'FIPCoun…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "from prettytable import PrettyTable\n",
 "from ipywidgets import interact, widgets\n",
 "import numpy as np\n",
 "from IPython.display import display, HTML\n",
 "from matplotlib import pyplot as plt\n",
 "\n",
 "def cast(v):\n",
 "    \"\"\"Return v as a float, or NaN when it cannot be read as a number.\"\"\"\n",
 "    try:\n",
 "        return float(v)\n",
 "    except (TypeError, ValueError):\n",
 "        return np.nan\n",
 "\n",
 "\n",
 "def sorted_unique(series):\n",
 "    \"\"\"Return the distinct non-missing values of a column as a sorted list.\"\"\"\n",
 "    values = series.dropna().unique().tolist()\n",
 "    try:\n",
 "        return sorted(values)\n",
 "    except TypeError:\n",
 "        # text and numbers mixed in one column cannot be compared directly\n",
 "        return sorted(values, key=str)\n",
 "\n",
 "\n",
 "variable_select = widgets.Dropdown(options=data.columns.sort_values())\n",
 "\n",
 "drop_select = widgets.SelectMultiple(options=[])\n",
 "\n",
 "def update_drop_select(*args):\n",
 "    \"\"\"Refresh the drop list with the answers of the selected variable.\"\"\"\n",
 "    drop_select.options = sorted_unique(data[variable_select.value])\n",
 "\n",
 "variable_select.observe(update_drop_select, 'value')\n",
 "update_drop_select()  # fill the list for the variable selected at start-up\n",
 "\n",
 "def printer(variable, drop_vals, drop_na, zoom=widgets.IntSlider(min=10,max=100,step=5,value=10)):\n",
 "    \"\"\"Show the mean, standard deviation and histogram of a numeric variable.\n",
 "\n",
 "    variable  -- column of the survey table to summarise\n",
 "    drop_vals -- raw answers to leave out (as listed in the drop box)\n",
 "    drop_na   -- if True, also leave out negative codes and non-numeric answers\n",
 "    zoom      -- number of histogram bins\n",
 "    \"\"\"\n",
 "    raw = data[variable]\n",
 "    # Filter on the raw answers before converting to numbers: the drop box\n",
 "    # lists raw values, which no longer match once they are cast to float.\n",
 "    raw = raw[~raw.isin(list(drop_vals))]\n",
 "    values = raw.apply(cast)\n",
 "\n",
 "    if drop_na:\n",
 "        # Negative codes are treated as missing; 0 is kept as a valid answer.\n",
 "        # The comparison is False for NaN, so NaN rows are removed as well.\n",
 "        values = values[values > -1]\n",
 "\n",
 "    if len(drop_vals):\n",
 "        print('dropped values: {}'.format(drop_vals))\n",
 "\n",
 "    finite = values.dropna()  # a histogram cannot be drawn over NaN\n",
 "    if finite.empty:\n",
 "        print('no numeric values left to summarise for {}'.format(variable))\n",
 "        return\n",
 "\n",
 "    mu = finite.mean()\n",
 "    sigma = finite.std(ddof=0)  # population spread, same as np.std\n",
 "\n",
 "    table = PrettyTable()\n",
 "    table.field_names = [variable, 'mean', 'standard deviation']\n",
 "    table.add_row((variable, mu, sigma))\n",
 "\n",
 "    display(HTML(table.get_html_string()))\n",
 "    plt.figure(figsize=(10,5))\n",
 "    plt.hist(finite, bins=zoom)\n",
 "    ax = plt.gca()\n",
 "    ymin, ymax = ax.get_ylim()\n",
 "\n",
 "    # Mark the mean (red) and 1, 2 and 3 standard deviations on both sides.\n",
 "    for n_sigma in range(-3, 4):\n",
 "        position = n_sigma*sigma + mu\n",
 "        if n_sigma == 0:\n",
 "            ax.vlines(position, ymin, ymax, alpha=1, color='red')\n",
 "        else:\n",
 "            # lines get shorter and fainter the further they are from the mean\n",
 "            ax.vlines(position, ymin, ymax/np.abs(n_sigma)/(zoom/10),\n",
 "                      alpha=1/np.abs(n_sigma), color='black')\n",
 "\n",
 "interact(printer, drop_vals=drop_select, variable=variable_select, drop_na=True);"
 ] }, { "cell_type": "raw", "metadata": {}, "source": [ "from prettytable import PrettyTable\n", "from ipywidgets import interact, widgets\n", "import numpy as np\n", "from IPython.core.display import display, HTML\n", "from matplotlib import pyplot as plt\n", "\n", "def cast(v):\n", " try:\n", " return float(v)\n", " except:\n", " return np.nan\n", "\n", "\n", "@interact(variable=data.columns.sort_values())\n", "def categorical_table(variable='V161267', zoom=widgets.IntSlider(min=10,max=100,step=5,value=10),drop_na=False):\n", " df = data.copy()\n", " df[variable] = df[variable].apply(cast)\n", " if drop_na:\n", " df = df[df[variable] > 0]\n", " \n", " x = PrettyTable()\n", " x.field_names = [variable, 'mean', 'standard deviation']\n", " mu = np.mean(df[variable])\n", " sigma = np.std(df[variable])\n", " \n", " result = (variable, mu, sigma)\n", " x.add_row(result)\n", " \n", " display(HTML(x.get_html_string()))\n", " plt.figure(figsize=(10,5))\n", " plt.hist(df[variable], bins=zoom)\n", " ax = plt.gca()\n", " ymin, ymax = ax.get_ylim()\n", " \n", " for val in range(-3,3):\n", " x = val*sigma+mu \n", " col='black'\n", " \n", " if val==0:\n", " ax.vlines(x,ymin,ymax, alpha=1, color='red')\n", " else:\n", " ax.vlines(x,ymin,ymax/np.abs(val)/(zoom/10), alpha=1/np.abs(val), color=col)\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. 
Compare two categorical variables (or ordinal)\n", " " ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8163f5d36ceb49588c1894d29394a359", "version_major": 2, "version_minor": 0 }, "text/plain": [ "interactive(children=(Dropdown(description='dependent_variable', options=('CompletedSurveys', 'EnrollmentDate'…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "from ipywidgets import interact, widgets\n",
 "import scipy.stats as scs\n",
 "from scipy.stats import chi2_contingency\n",
 "\n",
 "\n",
 "def sorted_unique(series):\n",
 "    \"\"\"Return the distinct non-missing values of a column as a sorted list.\"\"\"\n",
 "    values = series.dropna().unique().tolist()\n",
 "    try:\n",
 "        return sorted(values)\n",
 "    except TypeError:\n",
 "        # text and numbers mixed in one column cannot be compared directly\n",
 "        return sorted(values, key=str)\n",
 "\n",
 "\n",
 "dependent_variable_select = widgets.Dropdown(options=data.columns.sort_values())\n",
 "independent_variable_select = widgets.Dropdown(options=data.columns.sort_values())\n",
 "\n",
 "dependent_drop_select = widgets.SelectMultiple(options=[])\n",
 "independent_drop_select = widgets.SelectMultiple(options=[])\n",
 "\n",
 "def update_dependent_drop_select(*args):\n",
 "    \"\"\"Refresh the drop list with the answers of the dependent variable.\"\"\"\n",
 "    dependent_drop_select.options = sorted_unique(data[dependent_variable_select.value])\n",
 "\n",
 "\n",
 "def update_independent_drop_select(*args):\n",
 "    \"\"\"Refresh the drop list with the answers of the independent variable.\"\"\"\n",
 "    independent_drop_select.options = sorted_unique(data[independent_variable_select.value])\n",
 "\n",
 "\n",
 "dependent_variable_select.observe(update_dependent_drop_select, 'value')\n",
 "independent_variable_select.observe(update_independent_drop_select, 'value')\n",
 "# fill both lists for the variables selected at start-up\n",
 "update_dependent_drop_select()\n",
 "update_independent_drop_select()\n",
 "\n",
 "def categorical_table(dependent_variable, independent_variable, dep_drop_vals, indep_drop_vals,drop_na=True):\n",
 "    \"\"\"Cross-tabulate two variables and run a chi-squared test of independence.\n",
 "\n",
 "    dep_drop_vals / indep_drop_vals -- raw answers to leave out of each variable\n",
 "    drop_na -- if True, numeric answers that are not greater than -1 are left out\n",
 "\n",
 "    Prints the chi-squared statistic and p-value and returns the table of counts.\n",
 "    \"\"\"\n",
 "    # Filter on the raw answers before any numeric conversion: the drop boxes\n",
 "    # list raw values, which no longer match once a column is cast to float.\n",
 "    keep = ~data[dependent_variable].isin(list(dep_drop_vals))\n",
 "    keep &= ~data[independent_variable].isin(list(indep_drop_vals))\n",
 "    # de-duplicate so choosing the same variable twice still gives one column\n",
 "    columns = list(dict.fromkeys([dependent_variable, independent_variable]))\n",
 "    df = data.loc[keep, columns].copy()\n",
 "\n",
 "    if drop_na:\n",
 "        for variable in [independent_variable, dependent_variable]:\n",
 "            try:\n",
 "                df[variable] = df[variable].astype(float)\n",
 "            except (TypeError, ValueError):\n",
 "                continue  # text answers: no numeric codes to filter on\n",
 "            df = df[df[variable]>-1].copy()\n",
 "\n",
 "    if len(dep_drop_vals):\n",
 "        print('dropped dependent values: {}'.format(dep_drop_vals))\n",
 "\n",
 "    if len(indep_drop_vals):\n",
 "        print('dropped independent values: {}'.format(indep_drop_vals))\n",
 "\n",
 "    cross_tab = pd.crosstab(df[dependent_variable], df[independent_variable])\n",
 "    if cross_tab.size == 0:\n",
 "        # chi2_contingency raises on an empty table\n",
 "        print('no responses left after filtering; nothing to compare')\n",
 "        return cross_tab\n",
 "    stats = chi2_contingency(cross_tab)\n",
 "    print(\"chi-sq = {}, p-val = {}\".format(round(stats[0],5), round(stats[1], 5)))\n",
 "    return cross_tab\n",
 "\n",
 "interact(categorical_table, dependent_variable=dependent_variable_select ,\n",
 "         independent_variable=independent_variable_select, dep_drop_vals=dependent_drop_select, \n",
 "         indep_drop_vals=independent_drop_select, drop_na=True);\n"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Compare a categorical with a numeric/ordinal" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "scrolled": false }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b08a6eb6d35b47119aec633a39e91b95", "version_major": 2, "version_minor": 0 }, "text/plain": [ "interactive(children=(Dropdown(description='categorical_variable', options=('CompletedSurveys', 'EnrollmentDat…" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
 "from ipywidgets import interact, widgets\n",
 "from matplotlib import pyplot as plt\n",
 "import scipy.stats as scs\n",
 "import statsmodels.api as sm\n",
 "from statsmodels.formula.api import ols\n",
 "\n",
 "MAX_CATEGORIES = 15  # a variable with more distinct answers is not treated as categorical\n",
 "\n",
 "\n",
 "def sorted_unique(series):\n",
 "    \"\"\"Return the distinct non-missing values of a column as a sorted list.\"\"\"\n",
 "    values = series.dropna().unique().tolist()\n",
 "    try:\n",
 "        return sorted(values)\n",
 "    except TypeError:\n",
 "        # text and numbers mixed in one column cannot be compared directly\n",
 "        return sorted(values, key=str)\n",
 "\n",
 "\n",
 "categorical_variable_select = widgets.Dropdown(options=data.columns.sort_values())\n",
 "numeric_variable_select = widgets.Dropdown(options=data.columns.sort_values())\n",
 "\n",
 "categorical_drop_select = widgets.SelectMultiple(options=[])\n",
 "numeric_drop_select = widgets.SelectMultiple(options=[])\n",
 "\n",
 "def update_categorical_drop_select(*args):\n",
 "    \"\"\"Refresh the drop list with the answers of the categorical variable.\"\"\"\n",
 "    categorical_drop_select.options = sorted_unique(data[categorical_variable_select.value])\n",
 "\n",
 "def update_numeric_drop_select(*args):\n",
 "    \"\"\"Refresh the drop list with the answers of the numeric variable.\"\"\"\n",
 "    numeric_drop_select.options = sorted_unique(data[numeric_variable_select.value])\n",
 "\n",
 "\n",
 "categorical_variable_select.observe(update_categorical_drop_select, 'value')\n",
 "numeric_variable_select.observe(update_numeric_drop_select, 'value')\n",
 "# fill both lists for the variables selected at start-up\n",
 "update_categorical_drop_select()\n",
 "update_numeric_drop_select()\n",
 "\n",
 "\n",
 "def categorical_table(categorical_variable, numeric_variable,cat_drop_vals, num_drop_vals, drop_na=True):\n",
 "    \"\"\"Compare a numeric variable across the groups of a categorical variable.\n",
 "\n",
 "    cat_drop_vals / num_drop_vals -- raw answers to leave out of each variable\n",
 "    drop_na -- if True, numeric answers that are not greater than -1 are left out\n",
 "\n",
 "    Draws one histogram per group and returns the pairwise t-test table.\n",
 "    \"\"\"\n",
 "    # Filter on the raw answers before any numeric conversion: the drop boxes\n",
 "    # list raw values, which no longer match once a column is cast to float.\n",
 "    keep = ~data[categorical_variable].isin(list(cat_drop_vals))\n",
 "    keep &= ~data[numeric_variable].isin(list(num_drop_vals))\n",
 "    # de-duplicate so choosing the same variable twice still gives one column\n",
 "    columns = list(dict.fromkeys([categorical_variable, numeric_variable]))\n",
 "    df = data.loc[keep, columns].copy()\n",
 "\n",
 "    if drop_na:\n",
 "        for variable in [categorical_variable, numeric_variable]:\n",
 "            try:\n",
 "                df[variable] = df[variable].astype(float)\n",
 "            except (TypeError, ValueError):\n",
 "                continue  # text answers: no numeric codes to filter on\n",
 "            df = df[df[variable]>-1].copy()\n",
 "\n",
 "    if len(df[categorical_variable].unique())>MAX_CATEGORIES:\n",
 "        print(\"PLEASE CHOOSE A CATEGORICAL VARIABLE\")\n",
 "        return\n",
 "\n",
 "    try:\n",
 "        # keep the converted column so the plot and the model see numbers\n",
 "        df[numeric_variable] = df[numeric_variable].astype(float)\n",
 "    except (TypeError, ValueError):\n",
 "        print(\"PLEASE CHOOSE A NUMERIC VARIABLE\")\n",
 "        return\n",
 "\n",
 "    if len(cat_drop_vals):\n",
 "        print('dropped categorical values: {}'.format(cat_drop_vals))\n",
 "\n",
 "    if len(num_drop_vals):\n",
 "        print('dropped numeric values: {}'.format(num_drop_vals))\n",
 "\n",
 "    categories = sorted_unique(df[categorical_variable])\n",
 "    if len(categories) < 2:\n",
 "        # pairwise tests need at least two groups\n",
 "        print('need at least two categories with responses to compare')\n",
 "        return\n",
 "\n",
 "    plt.figure(figsize=(10,5))\n",
 "    ax = plt.gca()\n",
 "    for category in categories:\n",
 "        in_group = df[categorical_variable] == category\n",
 "        ax.hist(df.loc[in_group, numeric_variable].dropna(), alpha=.5, bins='doane')\n",
 "    ax.legend(categories)\n",
 "\n",
 "    res = ols(\"{} ~ C({})\".format(numeric_variable, categorical_variable), df).fit()\n",
 "    pw = res.t_test_pairwise(\"C({})\".format(categorical_variable))\n",
 "    return pw.result_frame[['t','P>|t|','pvalue-hs','reject-hs']]\n",
 "\n",
 "\n",
 "interact(categorical_table, categorical_variable=categorical_variable_select,\n",
 "         numeric_variable=numeric_variable_select, cat_drop_vals=categorical_drop_select, \n",
 "         num_drop_vals=numeric_drop_select, drop_na=True)\n",
 "pass\n",
 "\n"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 }