{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Bite Size Bayes\n",
"\n",
"\n",
"Copyright 2020 Allen B. Downey\n",
"\n",
"MIT License: https://opensource.org/licenses/MIT"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The dataset includes variables I selected from the General Social Survey, available from this project on the GSS site: https://gssdataexplorer.norc.org/projects/54786\n",
"\n",
"I also store the data in the GitHub repository for this book; the following cell downloads it, if necessary."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the data file\n",
"\n",
"import os\n",
"\n",
"if not os.path.exists('gss_bayes.tar.gz'):\n",
" !wget https://github.com/AllenDowney/BiteSizeBayes/raw/master/gss_bayes.tar.gz\n",
" !tar -xzf gss_bayes.tar.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`utils.py` provides `read_stata`, which reads the data from the Stata format."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" year | \n",
" relig | \n",
" srcbelt | \n",
" region | \n",
" adults | \n",
" wtssall | \n",
" ballot | \n",
" cohort | \n",
" feminist | \n",
" polviews | \n",
" partyid | \n",
" race | \n",
" sex | \n",
" educ | \n",
" age | \n",
" indus10 | \n",
" occ10 | \n",
" caseid | \n",
" realinc | \n",
"
\n",
" \n",
" caseid | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 1972 | \n",
" 3 | \n",
" 3 | \n",
" 3 | \n",
" 1 | \n",
" 0.4446 | \n",
" 0 | \n",
" 1949 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 2 | \n",
" 16 | \n",
" 23 | \n",
" 5170 | \n",
" 520 | \n",
" 1 | \n",
" 18951.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1972 | \n",
" 2 | \n",
" 3 | \n",
" 3 | \n",
" 2 | \n",
" 0.8893 | \n",
" 0 | \n",
" 1902 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 10 | \n",
" 70 | \n",
" 6470 | \n",
" 7700 | \n",
" 2 | \n",
" 24366.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 1972 | \n",
" 1 | \n",
" 3 | \n",
" 3 | \n",
" 2 | \n",
" 0.8893 | \n",
" 0 | \n",
" 1924 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 1 | \n",
" 2 | \n",
" 12 | \n",
" 48 | \n",
" 7070 | \n",
" 4920 | \n",
" 3 | \n",
" 24366.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 1972 | \n",
" 5 | \n",
" 3 | \n",
" 3 | \n",
" 2 | \n",
" 0.8893 | \n",
" 0 | \n",
" 1945 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 2 | \n",
" 17 | \n",
" 27 | \n",
" 5170 | \n",
" 800 | \n",
" 4 | \n",
" 30458.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 1972 | \n",
" 1 | \n",
" 3 | \n",
" 3 | \n",
" 2 | \n",
" 0.8893 | \n",
" 0 | \n",
" 1911 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 12 | \n",
" 61 | \n",
" 6680 | \n",
" 5020 | \n",
" 5 | \n",
" 50763.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" year relig srcbelt region adults wtssall ballot cohort \\\n",
"caseid \n",
"1 1972 3 3 3 1 0.4446 0 1949 \n",
"2 1972 2 3 3 2 0.8893 0 1902 \n",
"3 1972 1 3 3 2 0.8893 0 1924 \n",
"4 1972 5 3 3 2 0.8893 0 1945 \n",
"5 1972 1 3 3 2 0.8893 0 1911 \n",
"\n",
" feminist polviews partyid race sex educ age indus10 occ10 \\\n",
"caseid \n",
"1 0 0 2 1 2 16 23 5170 520 \n",
"2 0 0 1 1 1 10 70 6470 7700 \n",
"3 0 0 3 1 2 12 48 7070 4920 \n",
"4 0 0 1 1 2 17 27 5170 800 \n",
"5 0 0 0 1 2 12 61 6680 5020 \n",
"\n",
" caseid realinc \n",
"caseid \n",
"1 1 18951.0 \n",
"2 2 24366.0 \n",
"3 3 24366.0 \n",
"4 4 30458.0 \n",
"5 5 50763.0 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from utils import read_stata\n",
"\n",
"gss = read_stata('GSS.dct', 'GSS.dat')\n",
"gss.rename(columns={'id_': 'caseid'}, inplace=True)\n",
"gss.index = gss['caseid']\n",
"gss.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def replace_invalid(series, bad_vals, replacement=np.nan):\n",
" \"\"\"Replace invalid values with NaN\n",
"\n",
" Modifies series in place.\n",
"\n",
" series: Pandas Series\n",
" bad_vals: list of values to replace\n",
" replacement: value to replace\n",
" \"\"\"\n",
" series.replace(bad_vals, replacement, inplace=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following cell replaces invalid responses for the variables we'll use."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"replace_invalid(gss['feminist'], [0, 8, 9])\n",
"replace_invalid(gss['polviews'], [0, 8, 9])\n",
"replace_invalid(gss['partyid'], [8, 9])\n",
"replace_invalid(gss['indus10'], [0, 9997, 9999])\n",
"replace_invalid(gss['age'], [0, 98, 99])"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def values(series):\n",
" \"\"\"Make a series of values and the number of times they appear.\n",
" \n",
" series: Pandas Series\n",
" \n",
" returns: Pandas Series\n",
" \"\"\"\n",
" return series.value_counts(dropna=False).sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### feminist\n",
"\n",
"https://gssdataexplorer.norc.org/variables/1698/vshow\n",
"\n",
"This question was only asked during one year, so we're limited to a small number of responses."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0 298\n",
"2.0 1083\n",
"NaN 61085\n",
"Name: feminist, dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"values(gss['feminist'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### polviews\n",
"\n",
"https://gssdataexplorer.norc.org/variables/178/vshow\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0 1560\n",
"2.0 6236\n",
"3.0 6754\n",
"4.0 20515\n",
"5.0 8407\n",
"6.0 7876\n",
"7.0 1733\n",
"NaN 9385\n",
"Name: polviews, dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"values(gss['polviews'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### partyid\n",
"\n",
"https://gssdataexplorer.norc.org/variables/141/vshow"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.0 9999\n",
"1.0 12942\n",
"2.0 7485\n",
"3.0 9474\n",
"4.0 5462\n",
"5.0 9661\n",
"6.0 6063\n",
"7.0 995\n",
"NaN 385\n",
"Name: partyid, dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"values(gss['partyid'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### race\n",
"\n",
"https://gssdataexplorer.norc.org/variables/82/vshow"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1 50340\n",
"2 8802\n",
"3 3324\n",
"Name: race, dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"values(gss['race'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### sex\n",
"\n",
"https://gssdataexplorer.norc.org/variables/81/vshow"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"1 27562\n",
"2 34904\n",
"Name: sex, dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"values(gss['sex'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### age\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"18.0 219\n",
"19.0 835\n",
"20.0 870\n",
"21.0 987\n",
"22.0 1042\n",
" ... \n",
"86.0 172\n",
"87.0 143\n",
"88.0 113\n",
"89.0 335\n",
"NaN 221\n",
"Name: age, Length: 73, dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"values(gss['age'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### indus10\n",
"\n",
"https://gssdataexplorer.norc.org/variables/17/vshow"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"170.0 458\n",
"180.0 444\n",
"190.0 37\n",
"270.0 69\n",
"280.0 36\n",
" ... \n",
"9770.0 13\n",
"9780.0 8\n",
"9790.0 53\n",
"9870.0 22\n",
"NaN 4704\n",
"Name: indus10, Length: 271, dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"values(gss['indus10'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Select subset\n",
"\n",
"Here's the subset of the data with valid responses for the variables we'll use."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(49290, 19)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"varnames = ['year', 'age', 'sex', 'polviews', 'partyid', 'indus10']\n",
"\n",
"valid = gss.dropna(subset=varnames)\n",
"valid.shape"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" year | \n",
" age | \n",
" sex | \n",
" polviews | \n",
" partyid | \n",
" indus10 | \n",
"
\n",
" \n",
" caseid | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 1974 | \n",
" 21.0 | \n",
" 1 | \n",
" 4.0 | \n",
" 2.0 | \n",
" 4970.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 1974 | \n",
" 41.0 | \n",
" 1 | \n",
" 5.0 | \n",
" 0.0 | \n",
" 9160.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 1974 | \n",
" 58.0 | \n",
" 2 | \n",
" 6.0 | \n",
" 1.0 | \n",
" 2670.0 | \n",
"
\n",
" \n",
" 6 | \n",
" 1974 | \n",
" 30.0 | \n",
" 1 | \n",
" 5.0 | \n",
" 4.0 | \n",
" 6870.0 | \n",
"
\n",
" \n",
" 7 | \n",
" 1974 | \n",
" 48.0 | \n",
" 1 | \n",
" 5.0 | \n",
" 4.0 | \n",
" 7860.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" year age sex polviews partyid indus10\n",
"caseid \n",
"1 1974 21.0 1 4.0 2.0 4970.0\n",
"2 1974 41.0 1 5.0 0.0 9160.0\n",
"5 1974 58.0 2 6.0 1.0 2670.0\n",
"6 1974 30.0 1 5.0 4.0 6870.0\n",
"7 1974 48.0 1 5.0 4.0 7860.0"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"subset = valid[varnames]\n",
"subset.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Save the data"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"subset.to_csv('gss_bayes.csv')"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-rw-rw-r-- 1 downey downey 1546290 Jan 21 10:11 gss_bayes.csv\r\n"
]
}
],
"source": [
"!ls -l gss_bayes.csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}