{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Welcome to your Quantitative Social Sciences Analysis Toolkit!\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. load survey data into the notebook \n",
    "(Run this cell first, and re-run it every time you close and reopen the notebook.)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd  # pandas supplies the table (DataFrame) type used for the whole analysis\n",
    "\n",
    "# Read the survey file into a single table named `data`.\n",
    "data = pd.read_csv(\n",
    "    'data/anes_pilot_2019.csv',\n",
    "    low_memory=False,  # parse the file in one pass instead of in chunks\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. display data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>3155</th>\n",
       "      <th>3156</th>\n",
       "      <th>3157</th>\n",
       "      <th>3158</th>\n",
       "      <th>3159</th>\n",
       "      <th>3160</th>\n",
       "      <th>3161</th>\n",
       "      <th>3162</th>\n",
       "      <th>3163</th>\n",
       "      <th>3164</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>version</th>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>...</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "      <td>ANES 2019 Pilot Study version 20200204</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>caseid</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>8</td>\n",
       "      <td>9</td>\n",
       "      <td>10</td>\n",
       "      <td>...</td>\n",
       "      <td>3156</td>\n",
       "      <td>3157</td>\n",
       "      <td>3158</td>\n",
       "      <td>3159</td>\n",
       "      <td>3160</td>\n",
       "      <td>3161</td>\n",
       "      <td>3162</td>\n",
       "      <td>3163</td>\n",
       "      <td>3164</td>\n",
       "      <td>3165</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weight</th>\n",
       "      <td>1.34719693063187</td>\n",
       "      <td>.780822076219216</td>\n",
       "      <td>.966366930694957</td>\n",
       "      <td>1.10348514780374</td>\n",
       "      <td>1.09069730256741</td>\n",
       "      <td>1.02140871415171</td>\n",
       "      <td>.964514474045239</td>\n",
       "      <td>.83469258858232</td>\n",
       "      <td>1.53541542020853</td>\n",
       "      <td>1.32458088383641</td>\n",
       "      <td>...</td>\n",
       "      <td>1.17827101584555</td>\n",
       "      <td>.783602487218187</td>\n",
       "      <td>.792508744423736</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>7.03646496881757</td>\n",
       "      <td>.892833236147303</td>\n",
       "      <td>1.58161278448241</td>\n",
       "      <td>.809576969671362</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>weight_spss</th>\n",
       "      <td>1.10160293017768</td>\n",
       "      <td>.638478211724453</td>\n",
       "      <td>.790198239229266</td>\n",
       "      <td>.902319805359118</td>\n",
       "      <td>.891863184309371</td>\n",
       "      <td>.835205905561853</td>\n",
       "      <td>.788683485426792</td>\n",
       "      <td>.682528129683763</td>\n",
       "      <td>1.25550918910451</td>\n",
       "      <td>1.08310978871303</td>\n",
       "      <td>...</td>\n",
       "      <td>.963472209656906</td>\n",
       "      <td>.640751753798312</td>\n",
       "      <td>.648034400315289</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>5.75371740500213</td>\n",
       "      <td>.73006973719765</td>\n",
       "      <td>1.29328477387127</td>\n",
       "      <td>.661991088100273</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>form</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>starttime</th>\n",
       "      <td>12/31/2019 18:57:33</td>\n",
       "      <td>12/21/2019 4:19:56</td>\n",
       "      <td>12/22/2019 23:03:28</td>\n",
       "      <td>12/31/2019 19:53:14</td>\n",
       "      <td>12/21/2019 4:07:09</td>\n",
       "      <td>12/21/2019 22:45:18</td>\n",
       "      <td>12/27/2019 19:16:05</td>\n",
       "      <td>12/21/2019 23:21:55</td>\n",
       "      <td>12/25/2019 5:39:51</td>\n",
       "      <td>12/28/2019 3:09:16</td>\n",
       "      <td>...</td>\n",
       "      <td>12/31/2019 19:41:53</td>\n",
       "      <td>12/31/2019 19:40:28</td>\n",
       "      <td>12/31/2019 19:40:59</td>\n",
       "      <td>12/31/2019 19:41:26</td>\n",
       "      <td>12/31/2019 19:42:13</td>\n",
       "      <td>12/31/2019 19:38:13</td>\n",
       "      <td>12/31/2019 20:14:34</td>\n",
       "      <td>12/31/2019 20:10:04</td>\n",
       "      <td>12/31/2019 22:10:05</td>\n",
       "      <td>12/31/2019 23:27:51</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>endtime</th>\n",
       "      <td>12/31/2019 19:39:49</td>\n",
       "      <td>12/21/2019 4:53:19</td>\n",
       "      <td>12/22/2019 23:41:43</td>\n",
       "      <td>12/31/2019 20:23:11</td>\n",
       "      <td>12/21/2019 4:48:50</td>\n",
       "      <td>12/22/2019 0:28:27</td>\n",
       "      <td>12/27/2019 19:45:45</td>\n",
       "      <td>12/21/2019 23:40:20</td>\n",
       "      <td>12/25/2019 5:57:21</td>\n",
       "      <td>12/28/2019 3:35:48</td>\n",
       "      <td>...</td>\n",
       "      <td>12/31/2019 20:08:20</td>\n",
       "      <td>12/31/2019 20:17:50</td>\n",
       "      <td>12/31/2019 20:13:32</td>\n",
       "      <td>12/31/2019 20:22:45</td>\n",
       "      <td>12/31/2019 20:28:23</td>\n",
       "      <td>12/31/2019 20:24:56</td>\n",
       "      <td>12/31/2019 20:53:50</td>\n",
       "      <td>12/31/2019 20:29:15</td>\n",
       "      <td>12/31/2019 22:52:37</td>\n",
       "      <td>1/1/2020 0:21:59</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>duration</th>\n",
       "      <td>2536</td>\n",
       "      <td>2003</td>\n",
       "      <td>2295</td>\n",
       "      <td>1797</td>\n",
       "      <td>2501</td>\n",
       "      <td>6189</td>\n",
       "      <td>1780</td>\n",
       "      <td>1105</td>\n",
       "      <td>1050</td>\n",
       "      <td>1592</td>\n",
       "      <td>...</td>\n",
       "      <td>1587</td>\n",
       "      <td>2242</td>\n",
       "      <td>1953</td>\n",
       "      <td>2479</td>\n",
       "      <td>2770</td>\n",
       "      <td>2803</td>\n",
       "      <td>2356</td>\n",
       "      <td>1151</td>\n",
       "      <td>2552</td>\n",
       "      <td>3248</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pop_density_public</th>\n",
       "      <td>1520</td>\n",
       "      <td>1800</td>\n",
       "      <td>70</td>\n",
       "      <td>7600</td>\n",
       "      <td>4430</td>\n",
       "      <td>11900</td>\n",
       "      <td>700</td>\n",
       "      <td>45000</td>\n",
       "      <td>5700</td>\n",
       "      <td>120</td>\n",
       "      <td>...</td>\n",
       "      <td>400</td>\n",
       "      <td>3700</td>\n",
       "      <td>2000</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>1800</td>\n",
       "      <td>200</td>\n",
       "      <td>6600</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>flag_state</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>900 rows × 3165 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                      0     \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                   1   \n",
       "weight                                    1.34719693063187   \n",
       "weight_spss                               1.10160293017768   \n",
       "form                                                     1   \n",
       "...                                                    ...   \n",
       "starttime                              12/31/2019 18:57:33   \n",
       "endtime                                12/31/2019 19:39:49   \n",
       "duration                                              2536   \n",
       "pop_density_public                                    1520   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      1     \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                   2   \n",
       "weight                                    .780822076219216   \n",
       "weight_spss                               .638478211724453   \n",
       "form                                                     1   \n",
       "...                                                    ...   \n",
       "starttime                               12/21/2019 4:19:56   \n",
       "endtime                                 12/21/2019 4:53:19   \n",
       "duration                                              2003   \n",
       "pop_density_public                                    1800   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      2     \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                   3   \n",
       "weight                                    .966366930694957   \n",
       "weight_spss                               .790198239229266   \n",
       "form                                                     1   \n",
       "...                                                    ...   \n",
       "starttime                              12/22/2019 23:03:28   \n",
       "endtime                                12/22/2019 23:41:43   \n",
       "duration                                              2295   \n",
       "pop_density_public                                      70   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      3     \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                   4   \n",
       "weight                                    1.10348514780374   \n",
       "weight_spss                               .902319805359118   \n",
       "form                                                     2   \n",
       "...                                                    ...   \n",
       "starttime                              12/31/2019 19:53:14   \n",
       "endtime                                12/31/2019 20:23:11   \n",
       "duration                                              1797   \n",
       "pop_density_public                                    7600   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      4     \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                   5   \n",
       "weight                                    1.09069730256741   \n",
       "weight_spss                               .891863184309371   \n",
       "form                                                     2   \n",
       "...                                                    ...   \n",
       "starttime                               12/21/2019 4:07:09   \n",
       "endtime                                 12/21/2019 4:48:50   \n",
       "duration                                              2501   \n",
       "pop_density_public                                    4430   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      5     \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                   6   \n",
       "weight                                    1.02140871415171   \n",
       "weight_spss                               .835205905561853   \n",
       "form                                                     2   \n",
       "...                                                    ...   \n",
       "starttime                              12/21/2019 22:45:18   \n",
       "endtime                                 12/22/2019 0:28:27   \n",
       "duration                                              6189   \n",
       "pop_density_public                                   11900   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      6     \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                   7   \n",
       "weight                                    .964514474045239   \n",
       "weight_spss                               .788683485426792   \n",
       "form                                                     2   \n",
       "...                                                    ...   \n",
       "starttime                              12/27/2019 19:16:05   \n",
       "endtime                                12/27/2019 19:45:45   \n",
       "duration                                              1780   \n",
       "pop_density_public                                     700   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      7     \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                   8   \n",
       "weight                                     .83469258858232   \n",
       "weight_spss                               .682528129683763   \n",
       "form                                                     1   \n",
       "...                                                    ...   \n",
       "starttime                              12/21/2019 23:21:55   \n",
       "endtime                                12/21/2019 23:40:20   \n",
       "duration                                              1105   \n",
       "pop_density_public                                   45000   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      8     \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                   9   \n",
       "weight                                    1.53541542020853   \n",
       "weight_spss                               1.25550918910451   \n",
       "form                                                     1   \n",
       "...                                                    ...   \n",
       "starttime                               12/25/2019 5:39:51   \n",
       "endtime                                 12/25/2019 5:57:21   \n",
       "duration                                              1050   \n",
       "pop_density_public                                    5700   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      9     ...  \\\n",
       "version             ANES 2019 Pilot Study version 20200204  ...   \n",
       "caseid                                                  10  ...   \n",
       "weight                                    1.32458088383641  ...   \n",
       "weight_spss                               1.08310978871303  ...   \n",
       "form                                                     1  ...   \n",
       "...                                                    ...  ...   \n",
       "starttime                               12/28/2019 3:09:16  ...   \n",
       "endtime                                 12/28/2019 3:35:48  ...   \n",
       "duration                                              1592  ...   \n",
       "pop_density_public                                     120  ...   \n",
       "flag_state                                               0  ...   \n",
       "\n",
       "                                                      3155  \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                3156   \n",
       "weight                                    1.17827101584555   \n",
       "weight_spss                               .963472209656906   \n",
       "form                                                     2   \n",
       "...                                                    ...   \n",
       "starttime                              12/31/2019 19:41:53   \n",
       "endtime                                12/31/2019 20:08:20   \n",
       "duration                                              1587   \n",
       "pop_density_public                                     400   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      3156  \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                3157   \n",
       "weight                                    .783602487218187   \n",
       "weight_spss                               .640751753798312   \n",
       "form                                                     1   \n",
       "...                                                    ...   \n",
       "starttime                              12/31/2019 19:40:28   \n",
       "endtime                                12/31/2019 20:17:50   \n",
       "duration                                              2242   \n",
       "pop_density_public                                    3700   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      3157  \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                3158   \n",
       "weight                                    .792508744423736   \n",
       "weight_spss                               .648034400315289   \n",
       "form                                                     2   \n",
       "...                                                    ...   \n",
       "starttime                              12/31/2019 19:40:59   \n",
       "endtime                                12/31/2019 20:13:32   \n",
       "duration                                              1953   \n",
       "pop_density_public                                    2000   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      3158  \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                3159   \n",
       "weight                                                       \n",
       "weight_spss                                                  \n",
       "form                                                     2   \n",
       "...                                                    ...   \n",
       "starttime                              12/31/2019 19:41:26   \n",
       "endtime                                12/31/2019 20:22:45   \n",
       "duration                                              2479   \n",
       "pop_density_public                                           \n",
       "flag_state                                                   \n",
       "\n",
       "                                                      3159  \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                3160   \n",
       "weight                                                       \n",
       "weight_spss                                                  \n",
       "form                                                     1   \n",
       "...                                                    ...   \n",
       "starttime                              12/31/2019 19:42:13   \n",
       "endtime                                12/31/2019 20:28:23   \n",
       "duration                                              2770   \n",
       "pop_density_public                                           \n",
       "flag_state                                                   \n",
       "\n",
       "                                                      3160  \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                3161   \n",
       "weight                                                       \n",
       "weight_spss                                                  \n",
       "form                                                     2   \n",
       "...                                                    ...   \n",
       "starttime                              12/31/2019 19:38:13   \n",
       "endtime                                12/31/2019 20:24:56   \n",
       "duration                                              2803   \n",
       "pop_density_public                                           \n",
       "flag_state                                                   \n",
       "\n",
       "                                                      3161  \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                3162   \n",
       "weight                                    7.03646496881757   \n",
       "weight_spss                               5.75371740500213   \n",
       "form                                                     2   \n",
       "...                                                    ...   \n",
       "starttime                              12/31/2019 20:14:34   \n",
       "endtime                                12/31/2019 20:53:50   \n",
       "duration                                              2356   \n",
       "pop_density_public                                    1800   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      3162  \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                3163   \n",
       "weight                                    .892833236147303   \n",
       "weight_spss                                .73006973719765   \n",
       "form                                                     2   \n",
       "...                                                    ...   \n",
       "starttime                              12/31/2019 20:10:04   \n",
       "endtime                                12/31/2019 20:29:15   \n",
       "duration                                              1151   \n",
       "pop_density_public                                     200   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      3163  \\\n",
       "version             ANES 2019 Pilot Study version 20200204   \n",
       "caseid                                                3164   \n",
       "weight                                    1.58161278448241   \n",
       "weight_spss                               1.29328477387127   \n",
       "form                                                     2   \n",
       "...                                                    ...   \n",
       "starttime                              12/31/2019 22:10:05   \n",
       "endtime                                12/31/2019 22:52:37   \n",
       "duration                                              2552   \n",
       "pop_density_public                                    6600   \n",
       "flag_state                                               0   \n",
       "\n",
       "                                                      3164  \n",
       "version             ANES 2019 Pilot Study version 20200204  \n",
       "caseid                                                3165  \n",
       "weight                                    .809576969671362  \n",
       "weight_spss                               .661991088100273  \n",
       "form                                                     1  \n",
       "...                                                    ...  \n",
       "starttime                              12/31/2019 23:27:51  \n",
       "endtime                                   1/1/2020 0:21:59  \n",
       "duration                                              3248  \n",
       "pop_density_public                                       1  \n",
       "flag_state                                               0  \n",
       "\n",
       "[900 rows x 3165 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show a snapshot of the raw data turned on its side: the first column lists\n",
    "# your variables, and the other columns are responses.\n",
    "data.transpose()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. get category counts for a categorical variable\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f3ab1ec39f1248e294d8b347aa104151",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "interactive(children=(Dropdown(description='variable', options=('CompletedSurveys', 'EnrollmentDate', 'FIPCoun…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from prettytable import PrettyTable\n",
    "# Widgets live in the `ipywidgets` package (the old `IPython.html.widgets` path\n",
    "# predates IPython 4.0), and `display`/`HTML` are public in `IPython.display`.\n",
    "from ipywidgets import interact, SelectMultiple\n",
    "from IPython.display import display, HTML\n",
    "\n",
    "@interact(variable=data.columns.sort_values())\n",
    "def categorical_table(variable='V161002'):\n",
    "    \"\"\"Render an HTML table of how many rows fall into each category of `variable`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    variable : str\n",
    "        Name of the column in `data` to tabulate. `interact` offers the sorted\n",
    "        column names of `data` as the choices.\n",
    "\n",
    "    Notes\n",
    "    -----\n",
    "    Counts come from `groupby(variable).size()`, so categories appear in sorted\n",
    "    order and, with pandas' default settings, rows whose value is missing (NaN)\n",
    "    are not counted.\n",
    "    \"\"\"\n",
    "    counts = data.groupby(variable).size()\n",
    "\n",
    "    x = PrettyTable()\n",
    "    x.field_names = [variable, 'Count']\n",
    "    # Iterate the counts Series directly: its index is the category and its\n",
    "    # value is the row count, so no reset_index()/iterrows() round trip is needed.\n",
    "    for category, count in counts.items():\n",
    "        x.add_row([category, count])\n",
    "    display(HTML(x.get_html_string()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. get average and spread for a continuous variable\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "91d143faa4794932b27007b8d2e4f968",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "interactive(children=(Dropdown(description='variable', options=('CompletedSurveys', 'EnrollmentDate', 'FIPCoun…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from prettytable import PrettyTable\n",
    "from ipywidgets import interact, widgets\n",
    "import numpy as np\n",
    "from IPython.display import display, HTML\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "def cast(v):\n",
    "    \"\"\"Return `v` as a float, or NaN when it cannot be read as a number.\"\"\"\n",
    "    try:\n",
    "        return float(v)\n",
    "    except (TypeError, ValueError, OverflowError):\n",
    "        return np.nan\n",
    "\n",
    "\n",
    "variable_select = widgets.Dropdown(options=data.columns.sort_values())\n",
    "\n",
    "drop_select = widgets.SelectMultiple(options=[])\n",
    "\n",
    "def update_drop_select(*args):\n",
    "    \"\"\"Offer the distinct values of the selected variable as candidates to drop.\"\"\"\n",
    "    values = data[variable_select.value].unique()\n",
    "    try:\n",
    "        values = np.sort(values)\n",
    "    except TypeError:\n",
    "        pass  # mixed types (e.g. text plus NaN) cannot be ordered; keep first-seen order\n",
    "    drop_select.options = values\n",
    "\n",
    "variable_select.observe(update_drop_select, 'value')\n",
    "# The observer only fires on changes, so fill the list for the initial selection too.\n",
    "update_drop_select()\n",
    "\n",
    "def printer(variable, drop_vals, drop_na, zoom=widgets.IntSlider(min=10,max=100,step=5,value=10)):\n",
    "    \"\"\"Show the mean and standard deviation of `variable` together with a histogram.\n",
    "\n",
    "    variable  -- column of `data` to summarise; entries that are not numbers become NaN\n",
    "    drop_vals -- raw values of the column to exclude before anything is computed\n",
    "    drop_na   -- when true, keep only values greater than -1 (this also removes NaN)\n",
    "    zoom      -- number of histogram bins; also scales the height of the sigma markers\n",
    "    \"\"\"\n",
    "    # Only the chosen column is needed; copying the whole frame on every redraw is wasteful.\n",
    "    raw = data[variable]\n",
    "    # Compare against the raw values, because that is what the drop list was built from.\n",
    "    raw = raw[[v not in drop_vals for v in raw]]\n",
    "    values = raw.apply(cast)\n",
    "\n",
    "    if drop_na:\n",
    "        # Same cut-off as the two comparison cells (> -1), so legitimate zeros are kept.\n",
    "        # NOTE(review): presumably negative codes mark missing answers -- confirm with the codebook.\n",
    "        values = values[values > -1]\n",
    "\n",
    "    if len(drop_vals):\n",
    "        print('dropped values: {}'.format(drop_vals))\n",
    "\n",
    "    plotted = values.dropna()\n",
    "    if plotted.empty:\n",
    "        print('no numeric values left to summarise')\n",
    "        return\n",
    "\n",
    "    mu = np.mean(values)\n",
    "    sigma = np.std(values)\n",
    "\n",
    "    table = PrettyTable()\n",
    "    table.field_names = [variable, 'mean', 'standard deviation']\n",
    "    table.add_row((variable, mu, sigma))\n",
    "    display(HTML(table.get_html_string()))\n",
    "\n",
    "    plt.figure(figsize=(10,5))\n",
    "    plt.hist(plotted, bins=zoom)\n",
    "    ax = plt.gca()\n",
    "    ymin, ymax = ax.get_ylim()\n",
    "\n",
    "    # Mean in red, plus 1, 2 and 3 standard deviations on BOTH sides of it\n",
    "    # (range(-3, 3) stopped at +2 and never drew the +3 marker).\n",
    "    for k in range(-3, 4):\n",
    "        position = k*sigma + mu\n",
    "        if k == 0:\n",
    "            ax.vlines(position, ymin, ymax, alpha=1, color='red')\n",
    "        else:\n",
    "            # markers get shorter and fainter the further they are from the mean\n",
    "            ax.vlines(position, ymin, ymax/abs(k)/(zoom/10), alpha=1/abs(k), color='black')\n",
    "\n",
    "interact(printer, drop_vals=drop_select, variable=variable_select, drop_na=True);"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "from prettytable import PrettyTable\n",
    "from ipywidgets import interact, widgets\n",
    "import numpy as np\n",
    "from IPython.display import display, HTML\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "def cast(v):\n",
    "    \"\"\"Return `v` as a float, or NaN when it cannot be read as a number.\"\"\"\n",
    "    try:\n",
    "        return float(v)\n",
    "    except (TypeError, ValueError, OverflowError):\n",
    "        return np.nan\n",
    "\n",
    "\n",
    "@interact(variable=data.columns.sort_values())\n",
    "def categorical_table(variable='V161267', zoom=widgets.IntSlider(min=10,max=100,step=5,value=10),drop_na=False):\n",
    "    \"\"\"Mean / standard deviation summary with a histogram, without a drop-values picker.\n",
    "\n",
    "    This lives in a raw cell, so it does not run unless the cell is converted to code.\n",
    "    \"\"\"\n",
    "    values = data[variable].apply(cast)\n",
    "    if drop_na:\n",
    "        # Keep values above -1 (the cut-off the comparison cells use) so zeros survive.\n",
    "        values = values[values > -1]\n",
    "\n",
    "    plotted = values.dropna()\n",
    "    if plotted.empty:\n",
    "        print('no numeric values left to summarise')\n",
    "        return\n",
    "\n",
    "    mu = np.mean(values)\n",
    "    sigma = np.std(values)\n",
    "\n",
    "    table = PrettyTable()\n",
    "    table.field_names = [variable, 'mean', 'standard deviation']\n",
    "    table.add_row((variable, mu, sigma))\n",
    "    display(HTML(table.get_html_string()))\n",
    "\n",
    "    plt.figure(figsize=(10,5))\n",
    "    plt.hist(plotted, bins=zoom)\n",
    "    ax = plt.gca()\n",
    "    ymin, ymax = ax.get_ylim()\n",
    "\n",
    "    # Mean in red, plus 1, 2 and 3 standard deviations on both sides of it.\n",
    "    for k in range(-3, 4):\n",
    "        position = k*sigma + mu\n",
    "        if k == 0:\n",
    "            ax.vlines(position, ymin, ymax, alpha=1, color='red')\n",
    "        else:\n",
    "            ax.vlines(position, ymin, ymax/abs(k)/(zoom/10), alpha=1/abs(k), color='black')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Compare two categorical (or ordinal) variables\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8163f5d36ceb49588c1894d29394a359",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "interactive(children=(Dropdown(description='dependent_variable', options=('CompletedSurveys', 'EnrollmentDate'…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from ipywidgets import interact, widgets\n",
    "import numpy as np\n",
    "import scipy.stats as scs\n",
    "from scipy.stats import chi2_contingency\n",
    "\n",
    "\n",
    "def _sorted_unique(series):\n",
    "    \"\"\"Return the distinct values of `series`, sorted when they can be ordered.\"\"\"\n",
    "    values = series.unique()\n",
    "    try:\n",
    "        return np.sort(values)\n",
    "    except TypeError:\n",
    "        return values  # mixed types (e.g. text plus NaN) cannot be ordered\n",
    "\n",
    "\n",
    "dependent_variable_select = widgets.Dropdown(options=data.columns.sort_values())\n",
    "independent_variable_select = widgets.Dropdown(options=data.columns.sort_values())\n",
    "\n",
    "dependent_drop_select = widgets.SelectMultiple(options=[])\n",
    "independent_drop_select = widgets.SelectMultiple(options=[])\n",
    "\n",
    "def update_dependent_drop_select(*args):\n",
    "    \"\"\"Refresh the droppable values for the chosen dependent variable.\"\"\"\n",
    "    dependent_drop_select.options = _sorted_unique(data[dependent_variable_select.value])\n",
    "\n",
    "\n",
    "def update_independent_drop_select(*args):\n",
    "    \"\"\"Refresh the droppable values for the chosen independent variable.\"\"\"\n",
    "    independent_drop_select.options = _sorted_unique(data[independent_variable_select.value])\n",
    "\n",
    "\n",
    "dependent_variable_select.observe(update_dependent_drop_select, 'value')\n",
    "independent_variable_select.observe(update_independent_drop_select, 'value')\n",
    "# Observers only fire on changes, so fill both lists for the initial selections too.\n",
    "update_dependent_drop_select()\n",
    "update_independent_drop_select()\n",
    "\n",
    "def categorical_table(dependent_variable, independent_variable, dep_drop_vals, indep_drop_vals,drop_na=True):\n",
    "    \"\"\"Cross-tabulate two variables and print a chi-square test of independence.\n",
    "\n",
    "    dependent_variable / independent_variable -- columns of `data` (rows / columns of the table)\n",
    "    dep_drop_vals / indep_drop_vals           -- raw values of each column to exclude\n",
    "    drop_na -- when true, numeric columns are converted to float and only values\n",
    "               greater than -1 are kept (this also removes NaN)\n",
    "\n",
    "    Returns the crosstab DataFrame.\n",
    "    \"\"\"\n",
    "    # Decide which rows the user dropped from the RAW values, because that is what\n",
    "    # the drop lists were built from (before any float conversion).\n",
    "    keep = np.array(\n",
    "        [dep not in dep_drop_vals and indep not in indep_drop_vals\n",
    "         for dep, indep in zip(data[dependent_variable], data[independent_variable])],\n",
    "        dtype=bool)\n",
    "    # Both dropdowns start on the same column; dict.fromkeys removes the duplicate\n",
    "    # while keeping order. Only these columns are copied, not the whole frame.\n",
    "    columns = list(dict.fromkeys([dependent_variable, independent_variable]))\n",
    "    df = data.loc[keep, columns].copy()\n",
    "\n",
    "    if drop_na:\n",
    "        for variable in columns:\n",
    "            try:\n",
    "                numeric = df[variable].astype(float)\n",
    "            except (TypeError, ValueError):\n",
    "                continue  # not a numeric column, so there is nothing to filter on\n",
    "            df[variable] = numeric\n",
    "            df = df[numeric > -1].copy()\n",
    "\n",
    "    if len(dep_drop_vals):\n",
    "        print('dropped dependent values: {}'.format(dep_drop_vals))\n",
    "\n",
    "    if len(indep_drop_vals):\n",
    "        print('dropped independent values: {}'.format(indep_drop_vals))\n",
    "\n",
    "    cross_tab = pd.crosstab(df[dependent_variable], df[independent_variable])\n",
    "    if cross_tab.size == 0:\n",
    "        # chi2_contingency raises ValueError on an empty table\n",
    "        print('no rows left after filtering, so the chi-square test was skipped')\n",
    "        return cross_tab\n",
    "    stats = chi2_contingency(cross_tab)\n",
    "    print('chi-sq = {}, p-val = {}'.format(round(stats[0],5), round(stats[1], 5)))\n",
    "    return cross_tab\n",
    "\n",
    "interact(categorical_table, dependent_variable=dependent_variable_select ,\n",
    "         independent_variable=independent_variable_select, dep_drop_vals=dependent_drop_select, \n",
    "         indep_drop_vals=independent_drop_select, drop_na=True);\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Compare a categorical variable with a numeric/ordinal variable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b08a6eb6d35b47119aec633a39e91b95",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "interactive(children=(Dropdown(description='categorical_variable', options=('CompletedSurveys', 'EnrollmentDat…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from ipywidgets import interact, widgets\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from matplotlib import pyplot as plt\n",
    "import scipy.stats as scs\n",
    "import statsmodels.api as sm\n",
    "from statsmodels.formula.api import ols\n",
    "\n",
    "MAX_CATEGORIES = 15  # a variable with more distinct values than this is rejected as not categorical\n",
    "\n",
    "\n",
    "def _sorted_unique(series):\n",
    "    \"\"\"Return the distinct values of `series`, sorted when they can be ordered.\"\"\"\n",
    "    values = series.unique()\n",
    "    try:\n",
    "        return np.sort(values)\n",
    "    except TypeError:\n",
    "        return values  # mixed types (e.g. text plus NaN) cannot be ordered\n",
    "\n",
    "\n",
    "categorical_variable_select = widgets.Dropdown(options=data.columns.sort_values())\n",
    "numeric_variable_select = widgets.Dropdown(options=data.columns.sort_values())\n",
    "\n",
    "categorical_drop_select = widgets.SelectMultiple(options=[])\n",
    "numeric_drop_select = widgets.SelectMultiple(options=[])\n",
    "\n",
    "def update_categorical_drop_select(*args):\n",
    "    \"\"\"Refresh the droppable values for the chosen categorical variable.\"\"\"\n",
    "    categorical_drop_select.options = _sorted_unique(data[categorical_variable_select.value])\n",
    "\n",
    "def update_numeric_drop_select(*args):\n",
    "    \"\"\"Refresh the droppable values for the chosen numeric variable.\"\"\"\n",
    "    numeric_drop_select.options = _sorted_unique(data[numeric_variable_select.value])\n",
    "\n",
    "\n",
    "categorical_variable_select.observe(update_categorical_drop_select, 'value')\n",
    "numeric_variable_select.observe(update_numeric_drop_select, 'value')\n",
    "# Observers only fire on changes, so fill both lists for the initial selections too.\n",
    "update_categorical_drop_select()\n",
    "update_numeric_drop_select()\n",
    "\n",
    "\n",
    "def categorical_table(categorical_variable, numeric_variable,cat_drop_vals, num_drop_vals, drop_na=True):\n",
    "    \"\"\"Plot the numeric variable per category and return pairwise t-tests between categories.\n",
    "\n",
    "    categorical_variable -- column of `data` used to group rows (at most MAX_CATEGORIES values)\n",
    "    numeric_variable     -- column of `data` that must be convertible to float\n",
    "    cat_drop_vals / num_drop_vals -- raw values of each column to exclude\n",
    "    drop_na -- when true, numeric columns are converted to float and only values\n",
    "               greater than -1 are kept (this also removes NaN)\n",
    "\n",
    "    Returns the t, P>|t|, pvalue-hs and reject-hs columns of the pairwise comparison,\n",
    "    or None when the selection cannot be analysed (a message is printed instead).\n",
    "    \"\"\"\n",
    "    # Work on two neutrally named columns: the model formula then never depends on the\n",
    "    # real column names (which need not be valid identifiers), and choosing the same\n",
    "    # column twice is harmless.\n",
    "    df = pd.DataFrame({'category': data[categorical_variable], 'value': data[numeric_variable]})\n",
    "    # Mark the rows the user dropped using the RAW values, because that is what the\n",
    "    # drop lists were built from (before any float conversion).\n",
    "    df['keep'] = [c not in cat_drop_vals and v not in num_drop_vals\n",
    "                  for c, v in zip(df['category'], df['value'])]\n",
    "\n",
    "    if drop_na:\n",
    "        for column in ['category', 'value']:\n",
    "            try:\n",
    "                numeric = df[column].astype(float)\n",
    "            except (TypeError, ValueError):\n",
    "                continue  # not a numeric column, so there is nothing to filter on\n",
    "            df[column] = numeric\n",
    "            df = df[numeric > -1].copy()\n",
    "\n",
    "    if len(df['category'].unique()) > MAX_CATEGORIES:\n",
    "        print('PLEASE CHOOSE A CATEGORICAL VARIABLE')\n",
    "        return\n",
    "\n",
    "    try:\n",
    "        # Keep the converted column: previously the result was thrown away, so with\n",
    "        # drop_na switched off the plot and the model still saw the raw values.\n",
    "        df['value'] = df['value'].astype(float)\n",
    "    except (TypeError, ValueError):\n",
    "        print('PLEASE CHOOSE A NUMERIC VARIABLE')\n",
    "        return\n",
    "\n",
    "    # Missing values cannot be plotted or modelled, so drop them with the user's exclusions.\n",
    "    df = df[df['keep']].dropna(subset=['category', 'value'])\n",
    "\n",
    "    if len(cat_drop_vals):\n",
    "        print('dropped categorical values: {}'.format(cat_drop_vals))\n",
    "\n",
    "    if len(num_drop_vals):\n",
    "        print('dropped numeric values: {}'.format(num_drop_vals))\n",
    "\n",
    "    categories = _sorted_unique(df['category'])\n",
    "    if len(categories) == 0:\n",
    "        print('no rows left after filtering')\n",
    "        return\n",
    "\n",
    "    plt.figure(figsize=(10,5))\n",
    "    ax = plt.gca()\n",
    "    for c in categories:\n",
    "        ax.hist(df.loc[df['category'] == c, 'value'], alpha=.5, bins='doane')\n",
    "    ax.legend(categories, title=categorical_variable)\n",
    "    ax.set_xlabel(numeric_variable)\n",
    "    ax.set_ylabel('count')\n",
    "\n",
    "    if len(categories) < 2:\n",
    "        print('need at least two categories for pairwise comparisons')\n",
    "        return\n",
    "\n",
    "    res = ols('value ~ C(category)', df).fit()\n",
    "    pw = res.t_test_pairwise('C(category)')\n",
    "    return pw.result_frame[['t','P>|t|','pvalue-hs','reject-hs']]\n",
    "\n",
    "\n",
    "interact(categorical_table, categorical_variable=categorical_variable_select,\n",
    "         numeric_variable=numeric_variable_select, cat_drop_vals=categorical_drop_select, \n",
    "         num_drop_vals=numeric_drop_select, drop_na=True);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}