{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bite Size Bayes\n",
    "\n",
    "\n",
    "Copyright 2020 Allen B. Downey\n",
    "\n",
    "MIT License: https://opensource.org/licenses/MIT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The dataset includes variables I selected from the General Social Survey, available from this project on the GSS site: https://gssdataexplorer.norc.org/projects/54786\n",
    "\n",
    "I also store the data in the GitHub repository for this book; the following cell downloads it, if necessary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the data file\n",
    "\n",
    "import os\n",
    "\n",
    "if not os.path.exists('gss_bayes.tar.gz'):\n",
    "    !wget https://github.com/AllenDowney/BiteSizeBayes/raw/master/gss_bayes.tar.gz\n",
    "    !tar -xzf gss_bayes.tar.gz"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`utils.py` provides `read_stata`, which reads the data from the Stata format."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>year</th>\n",
       "      <th>relig</th>\n",
       "      <th>srcbelt</th>\n",
       "      <th>region</th>\n",
       "      <th>adults</th>\n",
       "      <th>wtssall</th>\n",
       "      <th>ballot</th>\n",
       "      <th>cohort</th>\n",
       "      <th>feminist</th>\n",
       "      <th>polviews</th>\n",
       "      <th>partyid</th>\n",
       "      <th>race</th>\n",
       "      <th>sex</th>\n",
       "      <th>educ</th>\n",
       "      <th>age</th>\n",
       "      <th>indus10</th>\n",
       "      <th>occ10</th>\n",
       "      <th>caseid</th>\n",
       "      <th>realinc</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>caseid</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1972</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0.4446</td>\n",
       "      <td>0</td>\n",
       "      <td>1949</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>16</td>\n",
       "      <td>23</td>\n",
       "      <td>5170</td>\n",
       "      <td>520</td>\n",
       "      <td>1</td>\n",
       "      <td>18951.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1972</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>0.8893</td>\n",
       "      <td>0</td>\n",
       "      <td>1902</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>10</td>\n",
       "      <td>70</td>\n",
       "      <td>6470</td>\n",
       "      <td>7700</td>\n",
       "      <td>2</td>\n",
       "      <td>24366.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1972</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>0.8893</td>\n",
       "      <td>0</td>\n",
       "      <td>1924</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>12</td>\n",
       "      <td>48</td>\n",
       "      <td>7070</td>\n",
       "      <td>4920</td>\n",
       "      <td>3</td>\n",
       "      <td>24366.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1972</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>0.8893</td>\n",
       "      <td>0</td>\n",
       "      <td>1945</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>17</td>\n",
       "      <td>27</td>\n",
       "      <td>5170</td>\n",
       "      <td>800</td>\n",
       "      <td>4</td>\n",
       "      <td>30458.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1972</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>0.8893</td>\n",
       "      <td>0</td>\n",
       "      <td>1911</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>12</td>\n",
       "      <td>61</td>\n",
       "      <td>6680</td>\n",
       "      <td>5020</td>\n",
       "      <td>5</td>\n",
       "      <td>50763.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        year  relig  srcbelt  region  adults  wtssall  ballot  cohort  \\\n",
       "caseid                                                                  \n",
       "1       1972      3        3       3       1   0.4446       0    1949   \n",
       "2       1972      2        3       3       2   0.8893       0    1902   \n",
       "3       1972      1        3       3       2   0.8893       0    1924   \n",
       "4       1972      5        3       3       2   0.8893       0    1945   \n",
       "5       1972      1        3       3       2   0.8893       0    1911   \n",
       "\n",
       "        feminist  polviews  partyid  race  sex  educ  age  indus10  occ10  \\\n",
       "caseid                                                                      \n",
       "1              0         0        2     1    2    16   23     5170    520   \n",
       "2              0         0        1     1    1    10   70     6470   7700   \n",
       "3              0         0        3     1    2    12   48     7070   4920   \n",
       "4              0         0        1     1    2    17   27     5170    800   \n",
       "5              0         0        0     1    2    12   61     6680   5020   \n",
       "\n",
       "        caseid  realinc  \n",
       "caseid                   \n",
       "1            1  18951.0  \n",
       "2            2  24366.0  \n",
       "3            3  24366.0  \n",
       "4            4  30458.0  \n",
       "5            5  50763.0  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from utils import read_stata\n",
    "\n",
    "gss = read_stata('GSS.dct', 'GSS.dat')\n",
    "gss.rename(columns={'id_': 'caseid'}, inplace=True)\n",
    "gss.index = gss['caseid']\n",
    "gss.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def replace_invalid(series, bad_vals, replacement=np.nan):\n",
    "    \"\"\"Replace invalid values with NaN\n",
    "\n",
    "    Modifies series in place.\n",
    "\n",
    "    series: Pandas Series\n",
    "    bad_vals: list of values to replace\n",
    "    replacement: value to replace\n",
    "    \"\"\"\n",
    "    series.replace(bad_vals, replacement, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The following cell replaces invalid responses for the variables we'll use."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "replace_invalid(gss['feminist'], [0, 8, 9])\n",
    "replace_invalid(gss['polviews'], [0, 8, 9])\n",
    "replace_invalid(gss['partyid'], [8, 9])\n",
    "replace_invalid(gss['indus10'], [0, 9997, 9999])\n",
    "replace_invalid(gss['age'], [0, 98, 99])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def values(series):\n",
    "    \"\"\"Make a series of values and the number of times they appear.\n",
    "    \n",
    "    series: Pandas Series\n",
    "    \n",
    "    returns: Pandas Series\n",
    "    \"\"\"\n",
    "    return series.value_counts(dropna=False).sort_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### feminist\n",
    "\n",
    "https://gssdataexplorer.norc.org/variables/1698/vshow\n",
    "\n",
    "This question was only asked during one year, so we're limited to a small number of responses."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0      298\n",
       "2.0     1083\n",
       "NaN    61085\n",
       "Name: feminist, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "values(gss['feminist'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### polviews\n",
    "\n",
    "https://gssdataexplorer.norc.org/variables/178/vshow\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0     1560\n",
       "2.0     6236\n",
       "3.0     6754\n",
       "4.0    20515\n",
       "5.0     8407\n",
       "6.0     7876\n",
       "7.0     1733\n",
       "NaN     9385\n",
       "Name: polviews, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "values(gss['polviews'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### partyid\n",
    "\n",
    "https://gssdataexplorer.norc.org/variables/141/vshow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0     9999\n",
       "1.0    12942\n",
       "2.0     7485\n",
       "3.0     9474\n",
       "4.0     5462\n",
       "5.0     9661\n",
       "6.0     6063\n",
       "7.0      995\n",
       "NaN      385\n",
       "Name: partyid, dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "values(gss['partyid'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### race\n",
    "\n",
    "https://gssdataexplorer.norc.org/variables/82/vshow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    50340\n",
       "2     8802\n",
       "3     3324\n",
       "Name: race, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "values(gss['race'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### sex\n",
    "\n",
    "https://gssdataexplorer.norc.org/variables/81/vshow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    27562\n",
       "2    34904\n",
       "Name: sex, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "values(gss['sex'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### age\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "18.0     219\n",
       "19.0     835\n",
       "20.0     870\n",
       "21.0     987\n",
       "22.0    1042\n",
       "        ... \n",
       "86.0     172\n",
       "87.0     143\n",
       "88.0     113\n",
       "89.0     335\n",
       "NaN      221\n",
       "Name: age, Length: 73, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "values(gss['age'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### indus10\n",
    "\n",
    "https://gssdataexplorer.norc.org/variables/17/vshow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "170.0      458\n",
       "180.0      444\n",
       "190.0       37\n",
       "270.0       69\n",
       "280.0       36\n",
       "          ... \n",
       "9770.0      13\n",
       "9780.0       8\n",
       "9790.0      53\n",
       "9870.0      22\n",
       "NaN       4704\n",
       "Name: indus10, Length: 271, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "values(gss['indus10'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Select subset\n",
    "\n",
    "Here's the subset of the data with valid responses for the variables we'll use."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(49290, 19)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "varnames = ['year', 'age', 'sex', 'polviews', 'partyid', 'indus10']\n",
    "\n",
    "valid = gss.dropna(subset=varnames)\n",
    "valid.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>year</th>\n",
       "      <th>age</th>\n",
       "      <th>sex</th>\n",
       "      <th>polviews</th>\n",
       "      <th>partyid</th>\n",
       "      <th>indus10</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>caseid</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1974</td>\n",
       "      <td>21.0</td>\n",
       "      <td>1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>4970.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1974</td>\n",
       "      <td>41.0</td>\n",
       "      <td>1</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>9160.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1974</td>\n",
       "      <td>58.0</td>\n",
       "      <td>2</td>\n",
       "      <td>6.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2670.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1974</td>\n",
       "      <td>30.0</td>\n",
       "      <td>1</td>\n",
       "      <td>5.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>6870.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1974</td>\n",
       "      <td>48.0</td>\n",
       "      <td>1</td>\n",
       "      <td>5.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>7860.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        year   age  sex  polviews  partyid  indus10\n",
       "caseid                                             \n",
       "1       1974  21.0    1       4.0      2.0   4970.0\n",
       "2       1974  41.0    1       5.0      0.0   9160.0\n",
       "5       1974  58.0    2       6.0      1.0   2670.0\n",
       "6       1974  30.0    1       5.0      4.0   6870.0\n",
       "7       1974  48.0    1       5.0      4.0   7860.0"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "subset = valid[varnames]\n",
    "subset.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "subset.to_csv('gss_bayes.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-rw-rw-r-- 1 downey downey 1546290 Jan 21 10:11 gss_bayes.csv\r\n"
     ]
    }
   ],
   "source": [
    "!ls -l gss_bayes.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}