{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Internet use and religion in Europe\n",
"-----------------------------------------\n",
"\n",
"This notebook presents a quick-and-dirty analysis of the association between Internet use and religion in Europe, using data from the European Social Survey (http://www.europeansocialsurvey.org).\n",
"\n",
"Copyright 2015 Allen Downey\n",
"\n",
"MIT License: http://opensource.org/licenses/MIT"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from __future__ import print_function, division\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import statsmodels.formula.api as smf\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following function selects the columns I need."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def select_cols(df):\n",
" cols = ['cntry', 'tvtot', 'tvpol', 'rdtot', 'rdpol', 'nwsptot', 'nwsppol', 'netuse', \n",
" 'rlgblg', 'rlgdgr', 'eduyrs', 'hinctnta', 'yrbrn', 'eisced', 'pspwght', 'pweight']\n",
" df = df[cols]\n",
" return df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read data from Cycle 1.\n",
"\n",
"TODO: investigate the difference between hinctnt and hinctnta; is there a recode that reconciles them?"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" cntry | \n",
" tvtot | \n",
" tvpol | \n",
" rdtot | \n",
" rdpol | \n",
" nwsptot | \n",
" nwsppol | \n",
" netuse | \n",
" rlgblg | \n",
" rlgdgr | \n",
" eduyrs | \n",
" hinctnta | \n",
" yrbrn | \n",
" eisced | \n",
" pspwght | \n",
" pweight | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" AT | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 5 | \n",
" 1 | \n",
" 8 | \n",
" 11 | \n",
" 77 | \n",
" 1949 | \n",
" 0 | \n",
" 0.940933 | \n",
" 0.271488 | \n",
"
\n",
" \n",
" 1 | \n",
" AT | \n",
" 3 | \n",
" 2 | \n",
" 4 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 6 | \n",
" 2 | \n",
" 5 | \n",
" 14 | \n",
" 2 | \n",
" 1953 | \n",
" 0 | \n",
" 0.470466 | \n",
" 0.271488 | \n",
"
\n",
" \n",
" 2 | \n",
" AT | \n",
" 7 | \n",
" 3 | \n",
" 0 | \n",
" 66 | \n",
" 0 | \n",
" 66 | \n",
" 0 | \n",
" 1 | \n",
" 7 | \n",
" 9 | \n",
" 77 | \n",
" 1940 | \n",
" 0 | \n",
" 1.392155 | \n",
" 0.271488 | \n",
"
\n",
" \n",
" 3 | \n",
" AT | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 4 | \n",
" 1 | \n",
" 7 | \n",
" 18 | \n",
" 9 | \n",
" 1959 | \n",
" 0 | \n",
" 1.382163 | \n",
" 0.271488 | \n",
"
\n",
" \n",
" 4 | \n",
" AT | \n",
" 0 | \n",
" 66 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 66 | \n",
" 7 | \n",
" 1 | \n",
" 10 | \n",
" 15 | \n",
" 9 | \n",
" 1962 | \n",
" 0 | \n",
" 1.437766 | \n",
" 0.271488 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" cntry tvtot tvpol rdtot rdpol nwsptot nwsppol netuse rlgblg rlgdgr \\\n",
"0 AT 1 1 1 1 1 1 5 1 8 \n",
"1 AT 3 2 4 1 2 2 6 2 5 \n",
"2 AT 7 3 0 66 0 66 0 1 7 \n",
"3 AT 1 1 1 1 2 2 4 1 7 \n",
"4 AT 0 66 1 1 0 66 7 1 10 \n",
"\n",
" eduyrs hinctnta yrbrn eisced pspwght pweight \n",
"0 11 77 1949 0 0.940933 0.271488 \n",
"1 14 2 1953 0 0.470466 0.271488 \n",
"2 9 77 1940 0 1.392155 0.271488 \n",
"3 18 9 1959 0 1.382163 0.271488 \n",
"4 15 9 1962 0 1.437766 0.271488 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1 = pd.read_stata('ESS1e06_4.dta', convert_categoricals=False)\n",
"df1['hinctnta'] = df1.hinctnt\n",
"df1 = select_cols(df1)\n",
"df1.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read data from Cycle 2."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" cntry | \n",
" tvtot | \n",
" tvpol | \n",
" rdtot | \n",
" rdpol | \n",
" nwsptot | \n",
" nwsppol | \n",
" netuse | \n",
" rlgblg | \n",
" rlgdgr | \n",
" eduyrs | \n",
" hinctnta | \n",
" yrbrn | \n",
" eisced | \n",
" pspwght | \n",
" pweight | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" AT | \n",
" 3 | \n",
" 2 | \n",
" 0 | \n",
" 66 | \n",
" 4 | \n",
" 2 | \n",
" 7 | \n",
" 1 | \n",
" 7 | \n",
" 12 | \n",
" 8 | \n",
" 1971 | \n",
" 0 | \n",
" 0.682185 | \n",
" 0.302006 | \n",
"
\n",
" \n",
" 1 | \n",
" AT | \n",
" 7 | \n",
" 2 | \n",
" 3 | \n",
" 1 | \n",
" 5 | \n",
" 2 | \n",
" 0 | \n",
" 1 | \n",
" 7 | \n",
" 8 | \n",
" 4 | \n",
" 1925 | \n",
" 0 | \n",
" 0.565038 | \n",
" 0.302006 | \n",
"
\n",
" \n",
" 2 | \n",
" AT | \n",
" 6 | \n",
" 2 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 6 | \n",
" 2 | \n",
" 4 | \n",
" 13 | \n",
" 6 | \n",
" 1977 | \n",
" 0 | \n",
" 0.341133 | \n",
" 0.302006 | \n",
"
\n",
" \n",
" 3 | \n",
" AT | \n",
" 3 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 7 | \n",
" 1 | \n",
" 5 | \n",
" 8 | \n",
" 9 | \n",
" 1989 | \n",
" 0 | \n",
" 0.804050 | \n",
" 0.302006 | \n",
"
\n",
" \n",
" 4 | \n",
" AT | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 4 | \n",
" 2 | \n",
" 1 | \n",
" 11 | \n",
" 88 | \n",
" 1988 | \n",
" 0 | \n",
" 1.125251 | \n",
" 0.302006 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" cntry tvtot tvpol rdtot rdpol nwsptot nwsppol netuse rlgblg rlgdgr \\\n",
"0 AT 3 2 0 66 4 2 7 1 7 \n",
"1 AT 7 2 3 1 5 2 0 1 7 \n",
"2 AT 6 2 1 0 1 0 6 2 4 \n",
"3 AT 3 1 2 1 2 1 7 1 5 \n",
"4 AT 2 1 1 1 1 1 4 2 1 \n",
"\n",
" eduyrs hinctnta yrbrn eisced pspwght pweight \n",
"0 12 8 1971 0 0.682185 0.302006 \n",
"1 8 4 1925 0 0.565038 0.302006 \n",
"2 13 6 1977 0 0.341133 0.302006 \n",
"3 8 9 1989 0 0.804050 0.302006 \n",
"4 11 88 1988 0 1.125251 0.302006 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2 = pd.read_stata('ESS2e03_4.dta', convert_categoricals=False)\n",
"df2['hinctnta'] = df2.hinctnt\n",
"df2 = select_cols(df2)\n",
"df2.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read data from Cycle 3."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" cntry | \n",
" tvtot | \n",
" tvpol | \n",
" rdtot | \n",
" rdpol | \n",
" nwsptot | \n",
" nwsppol | \n",
" netuse | \n",
" rlgblg | \n",
" rlgdgr | \n",
" eduyrs | \n",
" hinctnta | \n",
" yrbrn | \n",
" eisced | \n",
" pspwght | \n",
" pweight | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" AT | \n",
" 5 | \n",
" 1 | \n",
" 7 | \n",
" 2 | \n",
" 0 | \n",
" 66 | \n",
" 6 | \n",
" 1 | \n",
" 3 | \n",
" 11 | \n",
" 7 | \n",
" 1980 | \n",
" 0 | \n",
" 0.949578 | \n",
" 0.289116 | \n",
"
\n",
" \n",
" 1 | \n",
" AT | \n",
" 3 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 7 | \n",
" 1 | \n",
" 9 | \n",
" 20 | \n",
" 5 | \n",
" 1974 | \n",
" 0 | \n",
" 1.412180 | \n",
" 0.289116 | \n",
"
\n",
" \n",
" 2 | \n",
" AT | \n",
" 1 | \n",
" 1 | \n",
" 7 | \n",
" 2 | \n",
" 2 | \n",
" 1 | \n",
" 7 | \n",
" 1 | \n",
" 6 | \n",
" 16 | \n",
" 77 | \n",
" 1954 | \n",
" 0 | \n",
" 0.723276 | \n",
" 0.289116 | \n",
"
\n",
" \n",
" 3 | \n",
" AT | \n",
" 4 | \n",
" 1 | \n",
" 7 | \n",
" 2 | \n",
" 3 | \n",
" 2 | \n",
" 6 | \n",
" 1 | \n",
" 5 | \n",
" 12 | \n",
" 88 | \n",
" 1967 | \n",
" 0 | \n",
" 0.625744 | \n",
" 0.289116 | \n",
"
\n",
" \n",
" 4 | \n",
" AT | \n",
" 6 | \n",
" 2 | \n",
" 6 | \n",
" 2 | \n",
" 3 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 7 | \n",
" 11 | \n",
" 88 | \n",
" 1971 | \n",
" 0 | \n",
" 0.417162 | \n",
" 0.289116 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" cntry tvtot tvpol rdtot rdpol nwsptot nwsppol netuse rlgblg rlgdgr \\\n",
"0 AT 5 1 7 2 0 66 6 1 3 \n",
"1 AT 3 1 2 1 2 1 7 1 9 \n",
"2 AT 1 1 7 2 2 1 7 1 6 \n",
"3 AT 4 1 7 2 3 2 6 1 5 \n",
"4 AT 6 2 6 2 3 1 0 1 7 \n",
"\n",
" eduyrs hinctnta yrbrn eisced pspwght pweight \n",
"0 11 7 1980 0 0.949578 0.289116 \n",
"1 20 5 1974 0 1.412180 0.289116 \n",
"2 16 77 1954 0 0.723276 0.289116 \n",
"3 12 88 1967 0 0.625744 0.289116 \n",
"4 11 88 1971 0 0.417162 0.289116 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3 = pd.read_stata('ESS3e03_5.dta', convert_categoricals=False)\n",
"df3['hinctnta'] = df3.hinctnt\n",
"df3 = select_cols(df3)\n",
"df3.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read data from Cycle 4."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" cntry | \n",
" tvtot | \n",
" tvpol | \n",
" rdtot | \n",
" rdpol | \n",
" nwsptot | \n",
" nwsppol | \n",
" netuse | \n",
" rlgblg | \n",
" rlgdgr | \n",
" eduyrs | \n",
" hinctnta | \n",
" yrbrn | \n",
" eisced | \n",
" pspwght | \n",
" pweight | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" BE | \n",
" 7 | \n",
" 1 | \n",
" 0 | \n",
" 66 | \n",
" 0 | \n",
" 66 | \n",
" 0 | \n",
" 2 | \n",
" 1 | \n",
" 18 | \n",
" 4 | \n",
" 1972 | \n",
" 6 | \n",
" 0.823223 | \n",
" 0.503773 | \n",
"
\n",
" \n",
" 1 | \n",
" BE | \n",
" 7 | \n",
" 2 | \n",
" 7 | \n",
" 3 | \n",
" 0 | \n",
" 66 | \n",
" 0 | \n",
" 2 | \n",
" 0 | \n",
" 15 | \n",
" 7 | \n",
" 1982 | \n",
" 6 | \n",
" 0.798610 | \n",
" 0.503773 | \n",
"
\n",
" \n",
" 2 | \n",
" BE | \n",
" 2 | \n",
" 2 | \n",
" 3 | \n",
" 3 | \n",
" 3 | \n",
" 3 | \n",
" 7 | \n",
" 1 | \n",
" 6 | \n",
" 18 | \n",
" 10 | \n",
" 1940 | \n",
" 7 | \n",
" 0.778020 | \n",
" 0.503773 | \n",
"
\n",
" \n",
" 3 | \n",
" BE | \n",
" 7 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 2 | \n",
" 0 | \n",
" 1 | \n",
" 6 | \n",
" 15 | \n",
" 7 | \n",
" 1931 | \n",
" 6 | \n",
" 0.777735 | \n",
" 0.503773 | \n",
"
\n",
" \n",
" 4 | \n",
" BE | \n",
" 0 | \n",
" 66 | \n",
" 3 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 7 | \n",
" 2 | \n",
" 0 | \n",
" 13 | \n",
" 7 | \n",
" 1982 | \n",
" 4 | \n",
" 0.960960 | \n",
" 0.503773 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" cntry tvtot tvpol rdtot rdpol nwsptot nwsppol netuse rlgblg rlgdgr \\\n",
"0 BE 7 1 0 66 0 66 0 2 1 \n",
"1 BE 7 2 7 3 0 66 0 2 0 \n",
"2 BE 2 2 3 3 3 3 7 1 6 \n",
"3 BE 7 2 2 2 2 2 0 1 6 \n",
"4 BE 0 66 3 0 1 1 7 2 0 \n",
"\n",
" eduyrs hinctnta yrbrn eisced pspwght pweight \n",
"0 18 4 1972 6 0.823223 0.503773 \n",
"1 15 7 1982 6 0.798610 0.503773 \n",
"2 18 10 1940 7 0.778020 0.503773 \n",
"3 15 7 1931 6 0.777735 0.503773 \n",
"4 13 7 1982 4 0.960960 0.503773 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df4 = pd.read_stata('ESS4e04_3.dta', convert_categoricals=False)\n",
"df4 = select_cols(df4)\n",
"df4.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Read data from Cycle 5."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" cntry | \n",
" tvtot | \n",
" tvpol | \n",
" rdtot | \n",
" rdpol | \n",
" nwsptot | \n",
" nwsppol | \n",
" netuse | \n",
" rlgblg | \n",
" rlgdgr | \n",
" eduyrs | \n",
" hinctnta | \n",
" yrbrn | \n",
" eisced | \n",
" pspwght | \n",
" pweight | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" BE | \n",
" 5 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 5 | \n",
" 15 | \n",
" 88 | \n",
" 1988 | \n",
" 4 | \n",
" 0.792865 | \n",
" 0.528619 | \n",
"
\n",
" \n",
" 1 | \n",
" BE | \n",
" 4 | \n",
" 3 | \n",
" 2 | \n",
" 2 | \n",
" 1 | \n",
" 2 | \n",
" 6 | \n",
" 2 | \n",
" 7 | \n",
" 15 | \n",
" 5 | \n",
" 1967 | \n",
" 6 | \n",
" 0.871107 | \n",
" 0.528619 | \n",
"
\n",
" \n",
" 2 | \n",
" BE | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 66 | \n",
" 0 | \n",
" 66 | \n",
" 7 | \n",
" 2 | \n",
" 0 | \n",
" 13 | \n",
" 1 | \n",
" 1991 | \n",
" 3 | \n",
" 0.799453 | \n",
" 0.528619 | \n",
"
\n",
" \n",
" 3 | \n",
" BE | \n",
" 2 | \n",
" 2 | \n",
" 7 | \n",
" 2 | \n",
" 0 | \n",
" 66 | \n",
" 7 | \n",
" 1 | \n",
" 5 | \n",
" 15 | \n",
" 10 | \n",
" 1987 | \n",
" 6 | \n",
" 0.816030 | \n",
" 0.528619 | \n",
"
\n",
" \n",
" 4 | \n",
" BE | \n",
" 7 | \n",
" 3 | \n",
" 7 | \n",
" 4 | \n",
" 0 | \n",
" 66 | \n",
" 0 | \n",
" 2 | \n",
" 5 | \n",
" 15 | \n",
" 6 | \n",
" 1952 | \n",
" 6 | \n",
" 0.764902 | \n",
" 0.528619 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" cntry tvtot tvpol rdtot rdpol nwsptot nwsppol netuse rlgblg rlgdgr \\\n",
"0 BE 5 1 1 0 1 1 2 1 5 \n",
"1 BE 4 3 2 2 1 2 6 2 7 \n",
"2 BE 4 0 0 66 0 66 7 2 0 \n",
"3 BE 2 2 7 2 0 66 7 1 5 \n",
"4 BE 7 3 7 4 0 66 0 2 5 \n",
"\n",
" eduyrs hinctnta yrbrn eisced pspwght pweight \n",
"0 15 88 1988 4 0.792865 0.528619 \n",
"1 15 5 1967 6 0.871107 0.528619 \n",
"2 13 1 1991 3 0.799453 0.528619 \n",
"3 15 10 1987 6 0.816030 0.528619 \n",
"4 15 6 1952 6 0.764902 0.528619 "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df5 = pd.read_stata('ESS5e03_2.dta', convert_categoricals=False)\n",
"df5 = select_cols(df5)\n",
"df5.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Concatenate the cycles.\n",
"\n",
"TODO: Have to resample each cycle before concatenating."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" cntry | \n",
" tvtot | \n",
" tvpol | \n",
" rdtot | \n",
" rdpol | \n",
" nwsptot | \n",
" nwsppol | \n",
" netuse | \n",
" rlgblg | \n",
" rlgdgr | \n",
" eduyrs | \n",
" hinctnta | \n",
" yrbrn | \n",
" eisced | \n",
" pspwght | \n",
" pweight | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" AT | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 5 | \n",
" 1 | \n",
" 8 | \n",
" 11 | \n",
" 77 | \n",
" 1949 | \n",
" 0 | \n",
" 0.940933 | \n",
" 0.271488 | \n",
"
\n",
" \n",
" 1 | \n",
" AT | \n",
" 3 | \n",
" 2 | \n",
" 4 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 6 | \n",
" 2 | \n",
" 5 | \n",
" 14 | \n",
" 2 | \n",
" 1953 | \n",
" 0 | \n",
" 0.470466 | \n",
" 0.271488 | \n",
"
\n",
" \n",
" 2 | \n",
" AT | \n",
" 7 | \n",
" 3 | \n",
" 0 | \n",
" 66 | \n",
" 0 | \n",
" 66 | \n",
" 0 | \n",
" 1 | \n",
" 7 | \n",
" 9 | \n",
" 77 | \n",
" 1940 | \n",
" 0 | \n",
" 1.392155 | \n",
" 0.271488 | \n",
"
\n",
" \n",
" 3 | \n",
" AT | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 4 | \n",
" 1 | \n",
" 7 | \n",
" 18 | \n",
" 9 | \n",
" 1959 | \n",
" 0 | \n",
" 1.382163 | \n",
" 0.271488 | \n",
"
\n",
" \n",
" 4 | \n",
" AT | \n",
" 0 | \n",
" 66 | \n",
" 1 | \n",
" 1 | \n",
" 0 | \n",
" 66 | \n",
" 7 | \n",
" 1 | \n",
" 10 | \n",
" 15 | \n",
" 9 | \n",
" 1962 | \n",
" 0 | \n",
" 1.437766 | \n",
" 0.271488 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" cntry tvtot tvpol rdtot rdpol nwsptot nwsppol netuse rlgblg rlgdgr \\\n",
"0 AT 1 1 1 1 1 1 5 1 8 \n",
"1 AT 3 2 4 1 2 2 6 2 5 \n",
"2 AT 7 3 0 66 0 66 0 1 7 \n",
"3 AT 1 1 1 1 2 2 4 1 7 \n",
"4 AT 0 66 1 1 0 66 7 1 10 \n",
"\n",
" eduyrs hinctnta yrbrn eisced pspwght pweight \n",
"0 11 77 1949 0 0.940933 0.271488 \n",
"1 14 2 1953 0 0.470466 0.271488 \n",
"2 9 77 1940 0 1.392155 0.271488 \n",
"3 18 9 1959 0 1.382163 0.271488 \n",
"4 15 9 1962 0 1.437766 0.271488 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"TV watching time on average weekday"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 8601\n",
"1 12822\n",
"2 33235\n",
"3 32962\n",
"4 40128\n",
"5 30598\n",
"6 29477\n",
"7 53602\n",
"Name: tvtot, dtype: int64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.tvtot.replace([77, 88, 99], np.nan, inplace=True)\n",
"df.tvtot.value_counts().sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Radio listening, total time on average weekday."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 58733\n",
"1 36957\n",
"2 38032\n",
"3 19077\n",
"4 16108\n",
"5 10644\n",
"6 9737\n",
"7 51596\n",
"Name: rdtot, dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rdtot.replace([77, 88, 99], np.nan, inplace=True)\n",
"df.rdtot.value_counts().sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Newspaper reading, total time on average weekday."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 70283\n",
"1 73250\n",
"2 65009\n",
"3 18684\n",
"4 7635\n",
"5 2890\n",
"6 1440\n",
"7 2004\n",
"Name: nwsptot, dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.nwsptot.replace([77, 88, 99], np.nan, inplace=True)\n",
"df.nwsptot.value_counts().sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"TV watching: news, politics, current affairs"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 17050\n",
"1 74427\n",
"2 87036\n",
"3 29333\n",
"4 12751\n",
"5 5137\n",
"6 2895\n",
"7 3822\n",
"Name: tvpol, dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.tvpol.replace([66, 77, 88, 99], np.nan, inplace=True)\n",
"df.tvpol.value_counts().sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Radio listening: news, politics, current affairs"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 32696\n",
"1 77746\n",
"2 40234\n",
"3 13091\n",
"4 6583\n",
"5 3196\n",
"6 2268\n",
"7 5418\n",
"Name: rdpol, dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rdpol.replace([66, 77, 88, 99], np.nan, inplace=True)\n",
"df.rdpol.value_counts().sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Newspaper reading: politics, current affairs"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 24865\n",
"1 102663\n",
"2 32425\n",
"3 6342\n",
"4 2126\n",
"5 745\n",
"6 409\n",
"7 548\n",
"Name: nwsppol, dtype: int64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.nwsppol.replace([66, 77, 88, 99], np.nan, inplace=True)\n",
"df.nwsppol.value_counts().sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Personal use of Internet, email, www"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 76194\n",
"1 38069\n",
"2 4603\n",
"3 3817\n",
"4 8159\n",
"5 9623\n",
"6 27492\n",
"7 67203\n",
"Name: netuse, dtype: int64"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.netuse.replace([77, 88, 99], np.nan, inplace=True)\n",
"df.netuse.value_counts().sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Belong to a particular religion or denomination"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1 152296\n",
"2 85882\n",
"Name: rlgblg, dtype: int64"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rlgblg.replace([7, 8, 9], np.nan, inplace=True)\n",
"df.rlgblg.value_counts().sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"How religious"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 30804\n",
"1 13613\n",
"2 16672\n",
"3 19127\n",
"4 15676\n",
"5 41533\n",
"6 23765\n",
"7 28024\n",
"8 24844\n",
"9 11268\n",
"10 14492\n",
"Name: rlgdgr, dtype: int64"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.rlgdgr.replace([77, 88, 99], np.nan, inplace=True)\n",
"df.rlgdgr.value_counts().sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Total household net income, all sources\n",
"\n",
"TODO: It looks like one cycle measured HINCTNT on a 12 point scale. Might need to reconcile"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"1 11124\n",
"2 14462\n",
"3 16930\n",
"4 22156\n",
"5 21217\n",
"6 18625\n",
"7 17126\n",
"8 15892\n",
"9 20462\n",
"10 12154\n",
"11 1810\n",
"12 1177\n",
"Name: hinctnta, dtype: int64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.hinctnta.replace([77, 88, 99], np.nan, inplace=True)\n",
"df.hinctnta.value_counts().sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Shift income to mean near 0."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"count 173135.000000\n",
"mean 0.683698\n",
"std 2.741579\n",
"min -4.000000\n",
"25% -1.000000\n",
"50% 1.000000\n",
"75% 3.000000\n",
"max 7.000000\n",
"Name: hinctnta5, dtype: float64"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['hinctnta5'] = df.hinctnta - 5\n",
"df.hinctnta5.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Year born"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"count 240948.000000\n",
"mean 1959.308378\n",
"std 18.662873\n",
"min 1885.000000\n",
"25% 1945.000000\n",
"50% 1960.000000\n",
"75% 1974.000000\n",
"max 1996.000000\n",
"Name: yrbrn, dtype: float64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.yrbrn.replace([7777, 8888, 9999], np.nan, inplace=True)\n",
"df.yrbrn.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Shifted to mean near 0"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"count 240948.000000\n",
"mean -0.691622\n",
"std 18.662873\n",
"min -75.000000\n",
"25% -15.000000\n",
"50% 0.000000\n",
"75% 14.000000\n",
"max 36.000000\n",
"Name: yrbrn60, dtype: float64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['yrbrn60'] = df.yrbrn - 1960\n",
"df.yrbrn60.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Number of years of education"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.0 2178\n",
"1.0 516\n",
"2.0 811\n",
"3.0 1754\n",
"4.0 5548\n",
"5.0 3736\n",
"6.0 6954\n",
"7.0 6299\n",
"8.0 15688\n",
"9.0 16027\n",
"10.0 17956\n",
"11.0 25579\n",
"11.5 4\n",
"12.0 38669\n",
"13.0 21018\n",
"14.0 16129\n",
"14.5 1\n",
"15.0 15718\n",
"16.0 14073\n",
"17.0 10671\n",
"18.0 8208\n",
"18.5 1\n",
"19.0 3937\n",
"20.0 3810\n",
"21.0 1235\n",
"22.0 986\n",
"23.0 518\n",
"24.0 372\n",
"25.0 796\n",
"Name: eduyrs, dtype: int64"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.eduyrs.replace([77, 88, 99], np.nan, inplace=True)\n",
"df.loc[df.eduyrs > 25, 'eduyrs'] = 25\n",
"df.eduyrs.value_counts().sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"There are a bunch of really high values for eduyrs, need to investigate."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"count 239192.000000\n",
"mean 11.945939\n",
"std 4.057455\n",
"min 0.000000\n",
"25% 10.000000\n",
"50% 12.000000\n",
"75% 15.000000\n",
"max 25.000000\n",
"Name: eduyrs, dtype: float64"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.eduyrs.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Shift to mean near 0"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df['eduyrs12'] = df.eduyrs - 12"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"count 239192.000000\n",
"mean -0.054061\n",
"std 4.057455\n",
"min -12.000000\n",
"25% -2.000000\n",
"50% 0.000000\n",
"75% 3.000000\n",
"max 13.000000\n",
"Name: eduyrs12, dtype: float64"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.eduyrs12.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Country codes"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"AT 6918\n",
"BE 8939\n",
"BG 6064\n",
"CH 9310\n",
"CY 3293\n",
"CZ 8790\n",
"DE 14487\n",
"DK 7684\n",
"EE 6960\n",
"ES 9729\n",
"FI 9991\n",
"FR 9096\n",
"GB 11117\n",
"GR 9759\n",
"HR 3133\n",
"HU 7806\n",
"IE 10472\n",
"IL 7283\n",
"IS 579\n",
"IT 1207\n",
"LT 1677\n",
"LU 3187\n",
"LV 1980\n",
"NL 9741\n",
"NO 8643\n",
"PL 8917\n",
"PT 10302\n",
"RO 2146\n",
"RU 7544\n",
"SE 9201\n",
"SI 7126\n",
"SK 6944\n",
"TR 4272\n",
"UA 7809\n",
"Name: cntry, dtype: int64"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.cntry.value_counts().sort_index()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Make a binary dependent variable"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df['hasrelig'] = (df.rlgblg==1).astype(int)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run the model"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def run_model(df, formula):\n",
" model = smf.logit(formula, data=df) \n",
" results = model.fit(disp=False)\n",
" return results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here's the model with all control variables and all media variables:"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"Logit Regression Results\n",
"\n",
" Dep. Variable: | hasrelig | No. Observations: | 93549 | \n",
"
\n",
"\n",
" Model: | Logit | Df Residuals: | 93538 | \n",
"
\n",
"\n",
" Method: | MLE | Df Model: | 10 | \n",
"
\n",
"\n",
" Date: | Mon, 16 Nov 2015 | Pseudo R-squ.: | 0.01844 | \n",
"
\n",
"\n",
" Time: | 09:35:26 | Log-Likelihood: | -62073. | \n",
"
\n",
"\n",
" converged: | True | LL-Null: | -63239. | \n",
"
\n",
"\n",
" | | LLR p-value: | 0.000 | \n",
"
\n",
"
\n",
"\n",
"\n",
" | coef | std err | z | P>|z| | [95.0% Conf. Int.] | \n",
"
\n",
"\n",
" Intercept | 0.7241 | 0.026 | 27.825 | 0.000 | 0.673 0.775 | \n",
"
\n",
"\n",
" yrbrn60 | -0.0064 | 0.000 | -13.662 | 0.000 | -0.007 -0.005 | \n",
"
\n",
"\n",
" eduyrs12 | -0.0133 | 0.002 | -6.450 | 0.000 | -0.017 -0.009 | \n",
"
\n",
"\n",
" hinctnta5 | -0.0269 | 0.003 | -9.669 | 0.000 | -0.032 -0.021 | \n",
"
\n",
"\n",
" tvtot | -0.0260 | 0.004 | -6.054 | 0.000 | -0.034 -0.018 | \n",
"
\n",
"\n",
" tvpol | 0.0031 | 0.007 | 0.461 | 0.645 | -0.010 0.016 | \n",
"
\n",
"\n",
" rdtot | -0.0030 | 0.003 | -0.869 | 0.385 | -0.010 0.004 | \n",
"
\n",
"\n",
" rdpol | 0.0115 | 0.006 | 2.054 | 0.040 | 0.001 0.023 | \n",
"
\n",
"\n",
" nwsptot | 0.0129 | 0.008 | 1.610 | 0.107 | -0.003 0.029 | \n",
"
\n",
"\n",
" nwsppol | 0.0227 | 0.011 | 2.129 | 0.033 | 0.002 0.044 | \n",
"
\n",
"\n",
" netuse | -0.0690 | 0.003 | -24.256 | 0.000 | -0.075 -0.063 | \n",
"
\n",
"
"
],
"text/plain": [
"\n",
"\"\"\"\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: hasrelig No. Observations: 93549\n",
"Model: Logit Df Residuals: 93538\n",
"Method: MLE Df Model: 10\n",
"Date: Mon, 16 Nov 2015 Pseudo R-squ.: 0.01844\n",
"Time: 09:35:26 Log-Likelihood: -62073.\n",
"converged: True LL-Null: -63239.\n",
" LLR p-value: 0.000\n",
"==============================================================================\n",
" coef std err z P>|z| [95.0% Conf. Int.]\n",
"------------------------------------------------------------------------------\n",
"Intercept 0.7241 0.026 27.825 0.000 0.673 0.775\n",
"yrbrn60 -0.0064 0.000 -13.662 0.000 -0.007 -0.005\n",
"eduyrs12 -0.0133 0.002 -6.450 0.000 -0.017 -0.009\n",
"hinctnta5 -0.0269 0.003 -9.669 0.000 -0.032 -0.021\n",
"tvtot -0.0260 0.004 -6.054 0.000 -0.034 -0.018\n",
"tvpol 0.0031 0.007 0.461 0.645 -0.010 0.016\n",
"rdtot -0.0030 0.003 -0.869 0.385 -0.010 0.004\n",
"rdpol 0.0115 0.006 2.054 0.040 0.001 0.023\n",
"nwsptot 0.0129 0.008 1.610 0.107 -0.003 0.029\n",
"nwsppol 0.0227 0.011 2.129 0.033 0.002 0.044\n",
"netuse -0.0690 0.003 -24.256 0.000 -0.075 -0.063\n",
"==============================================================================\n",
"\"\"\""
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"formula = ('hasrelig ~ yrbrn60 + eduyrs12 + hinctnta5 +'\n",
" 'tvtot + tvpol + rdtot + rdpol + nwsptot + nwsppol + netuse')\n",
"res = run_model(df, formula)\n",
"res.summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Most of the media variables are not statistically significant. If we drop the politial media variables, we get a cleaner model:"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"Logit Regression Results\n",
"\n",
" Dep. Variable: | hasrelig | No. Observations: | 166035 | \n",
"
\n",
"\n",
" Model: | Logit | Df Residuals: | 166027 | \n",
"
\n",
"\n",
" Method: | MLE | Df Model: | 7 | \n",
"
\n",
"\n",
" Date: | Mon, 16 Nov 2015 | Pseudo R-squ.: | 0.03241 | \n",
"
\n",
"\n",
" Time: | 09:35:27 | Log-Likelihood: | -1.0707e+05 | \n",
"
\n",
"\n",
" converged: | True | LL-Null: | -1.1066e+05 | \n",
"
\n",
"\n",
" | | LLR p-value: | 0.000 | \n",
"
\n",
"
\n",
"\n",
"\n",
" | coef | std err | z | P>|z| | [95.0% Conf. Int.] | \n",
"
\n",
"\n",
" Intercept | 0.9297 | 0.017 | 56.089 | 0.000 | 0.897 0.962 | \n",
"
\n",
"\n",
" yrbrn60 | -0.0077 | 0.000 | -22.772 | 0.000 | -0.008 -0.007 | \n",
"
\n",
"\n",
" eduyrs12 | -0.0339 | 0.002 | -22.558 | 0.000 | -0.037 -0.031 | \n",
"
\n",
"\n",
" hinctnta5 | -0.0327 | 0.002 | -15.633 | 0.000 | -0.037 -0.029 | \n",
"
\n",
"\n",
" tvtot | -0.0225 | 0.003 | -8.498 | 0.000 | -0.028 -0.017 | \n",
"
\n",
"\n",
" rdtot | -0.0122 | 0.002 | -6.187 | 0.000 | -0.016 -0.008 | \n",
"
\n",
"\n",
" nwsptot | -0.0220 | 0.004 | -5.248 | 0.000 | -0.030 -0.014 | \n",
"
\n",
"\n",
" netuse | -0.0753 | 0.002 | -35.148 | 0.000 | -0.079 -0.071 | \n",
"
\n",
"
"
],
"text/plain": [
"\n",
"\"\"\"\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: hasrelig No. Observations: 166035\n",
"Model: Logit Df Residuals: 166027\n",
"Method: MLE Df Model: 7\n",
"Date: Mon, 16 Nov 2015 Pseudo R-squ.: 0.03241\n",
"Time: 09:35:27 Log-Likelihood: -1.0707e+05\n",
"converged: True LL-Null: -1.1066e+05\n",
" LLR p-value: 0.000\n",
"==============================================================================\n",
" coef std err z P>|z| [95.0% Conf. Int.]\n",
"------------------------------------------------------------------------------\n",
"Intercept 0.9297 0.017 56.089 0.000 0.897 0.962\n",
"yrbrn60 -0.0077 0.000 -22.772 0.000 -0.008 -0.007\n",
"eduyrs12 -0.0339 0.002 -22.558 0.000 -0.037 -0.031\n",
"hinctnta5 -0.0327 0.002 -15.633 0.000 -0.037 -0.029\n",
"tvtot -0.0225 0.003 -8.498 0.000 -0.028 -0.017\n",
"rdtot -0.0122 0.002 -6.187 0.000 -0.016 -0.008\n",
"nwsptot -0.0220 0.004 -5.248 0.000 -0.030 -0.014\n",
"netuse -0.0753 0.002 -35.148 0.000 -0.079 -0.071\n",
"==============================================================================\n",
"\"\"\""
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"formula = ('hasrelig ~ yrbrn60 + eduyrs12 + hinctnta5 +'\n",
" 'tvtot + rdtot + nwsptot + netuse')\n",
"res = run_model(df, formula)\n",
"res.summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And if we fill missing values for income, cleaner still."
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def fill_var(df, var):\n",
" fill = df[var].dropna().sample(len(df), replace=True)\n",
" fill.index = df.index\n",
" df[var].fillna(fill, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"fill_var(df, var='hinctnta5')"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"Logit Regression Results\n",
"\n",
" Dep. Variable: | hasrelig | No. Observations: | 229307 | \n",
"
\n",
"\n",
" Model: | Logit | Df Residuals: | 229299 | \n",
"
\n",
"\n",
" Method: | MLE | Df Model: | 7 | \n",
"
\n",
"\n",
" Date: | Mon, 16 Nov 2015 | Pseudo R-squ.: | 0.03200 | \n",
"
\n",
"\n",
" Time: | 09:35:27 | Log-Likelihood: | -1.4610e+05 | \n",
"
\n",
"\n",
" converged: | True | LL-Null: | -1.5093e+05 | \n",
"
\n",
"\n",
" | | LLR p-value: | 0.000 | \n",
"
\n",
"
\n",
"\n",
"\n",
" | coef | std err | z | P>|z| | [95.0% Conf. Int.] | \n",
"
\n",
"\n",
" Intercept | 0.9819 | 0.014 | 70.060 | 0.000 | 0.954 1.009 | \n",
"
\n",
"\n",
" yrbrn60 | -0.0078 | 0.000 | -27.847 | 0.000 | -0.008 -0.007 | \n",
"
\n",
"\n",
" eduyrs12 | -0.0375 | 0.001 | -29.599 | 0.000 | -0.040 -0.035 | \n",
"
\n",
"\n",
" hinctnta5 | -0.0226 | 0.002 | -13.297 | 0.000 | -0.026 -0.019 | \n",
"
\n",
"\n",
" tvtot | -0.0163 | 0.002 | -7.254 | 0.000 | -0.021 -0.012 | \n",
"
\n",
"\n",
" rdtot | -0.0149 | 0.002 | -8.839 | 0.000 | -0.018 -0.012 | \n",
"
\n",
"\n",
" nwsptot | -0.0319 | 0.004 | -8.908 | 0.000 | -0.039 -0.025 | \n",
"
\n",
"\n",
" netuse | -0.0757 | 0.002 | -42.036 | 0.000 | -0.079 -0.072 | \n",
"
\n",
"
"
],
"text/plain": [
"\n",
"\"\"\"\n",
" Logit Regression Results \n",
"==============================================================================\n",
"Dep. Variable: hasrelig No. Observations: 229307\n",
"Model: Logit Df Residuals: 229299\n",
"Method: MLE Df Model: 7\n",
"Date: Mon, 16 Nov 2015 Pseudo R-squ.: 0.03200\n",
"Time: 09:35:27 Log-Likelihood: -1.4610e+05\n",
"converged: True LL-Null: -1.5093e+05\n",
" LLR p-value: 0.000\n",
"==============================================================================\n",
" coef std err z P>|z| [95.0% Conf. Int.]\n",
"------------------------------------------------------------------------------\n",
"Intercept 0.9819 0.014 70.060 0.000 0.954 1.009\n",
"yrbrn60 -0.0078 0.000 -27.847 0.000 -0.008 -0.007\n",
"eduyrs12 -0.0375 0.001 -29.599 0.000 -0.040 -0.035\n",
"hinctnta5 -0.0226 0.002 -13.297 0.000 -0.026 -0.019\n",
"tvtot -0.0163 0.002 -7.254 0.000 -0.021 -0.012\n",
"rdtot -0.0149 0.002 -8.839 0.000 -0.018 -0.012\n",
"nwsptot -0.0319 0.004 -8.908 0.000 -0.039 -0.025\n",
"netuse -0.0757 0.002 -42.036 0.000 -0.079 -0.072\n",
"==============================================================================\n",
"\"\"\""
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"formula = ('hasrelig ~ yrbrn60 + eduyrs12 + hinctnta5 +'\n",
" 'tvtot + rdtot + nwsptot + netuse')\n",
"res = run_model(df, formula)\n",
"res.summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now all variables have small p-values. All parameters have the expected signs:\n",
"\n",
"1. Younger people are less affiliated.\n",
"2. More educated people are less affiliated.\n",
"3. Higher income people are less affiliated (although this could go either way)\n",
"4. Consumers of all media are less affiliated.\n",
"\n",
"The strength of the Internet effect is stronger than for other media.\n",
"\n",
"These results are consistent in each cycle of the data, and across a few changes I've made in the cleaning process.\n",
"\n",
"However, these results should be considered preliminary:\n",
"\n",
"1. I have not dealt with the stratification weights.\n",
"2. I have not dealt with missing data (particularly important for education)\n",
"\n",
"Nevertheless, I'll run a breakdown by country.\n",
"\n",
"Here's a function to extract the parameter associated with netuse:"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(229307, -0.075724674567100483, '**')"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def extract_res(res, var='netuse'):\n",
" param = res.params[var]\n",
" pvalue = res.pvalues[var]\n",
" stars = '**' if pvalue < 0.01 else '*' if pvalue < 0.05 else ''\n",
" return res.nobs, param, stars\n",
"\n",
"extract_res(res)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Running a similar model with degree of religiosity."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"OLS Regression Results\n",
"\n",
" Dep. Variable: | rlgdgr | R-squared: | 0.060 | \n",
"
\n",
"\n",
" Model: | OLS | Adj. R-squared: | 0.060 | \n",
"
\n",
"\n",
" Method: | Least Squares | F-statistic: | 2067. | \n",
"
\n",
"\n",
" Date: | Mon, 16 Nov 2015 | Prob (F-statistic): | 0.00 | \n",
"
\n",
"\n",
" Time: | 09:35:28 | Log-Likelihood: | -5.6310e+05 | \n",
"
\n",
"\n",
" No. Observations: | 227366 | AIC: | 1.126e+06 | \n",
"
\n",
"\n",
" Df Residuals: | 227358 | BIC: | 1.126e+06 | \n",
"
\n",
"\n",
" Df Model: | 7 | | | \n",
"
\n",
"\n",
" Covariance Type: | nonrobust | | | \n",
"
\n",
"
\n",
"\n",
"\n",
" | coef | std err | t | P>|t| | [95.0% Conf. Int.] | \n",
"
\n",
"\n",
" Intercept | 5.6668 | 0.019 | 300.071 | 0.000 | 5.630 5.704 | \n",
"
\n",
"\n",
" yrbrn60 | -0.0180 | 0.000 | -47.352 | 0.000 | -0.019 -0.017 | \n",
"
\n",
"\n",
" eduyrs12 | -0.0688 | 0.002 | -40.159 | 0.000 | -0.072 -0.065 | \n",
"
\n",
"\n",
" hinctnta5 | -0.0266 | 0.002 | -11.397 | 0.000 | -0.031 -0.022 | \n",
"
\n",
"\n",
" tvtot | -0.0801 | 0.003 | -26.334 | 0.000 | -0.086 -0.074 | \n",
"
\n",
"\n",
" rdtot | -0.0179 | 0.002 | -7.791 | 0.000 | -0.022 -0.013 | \n",
"
\n",
"\n",
" nwsptot | -0.0531 | 0.005 | -10.873 | 0.000 | -0.063 -0.044 | \n",
"
\n",
"\n",
" netuse | -0.1020 | 0.002 | -40.942 | 0.000 | -0.107 -0.097 | \n",
"
\n",
"
\n",
"\n",
"\n",
" Omnibus: | 29438.962 | Durbin-Watson: | 1.629 | \n",
"
\n",
"\n",
" Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 8137.927 | \n",
"
\n",
"\n",
" Skew: | -0.142 | Prob(JB): | 0.00 | \n",
"
\n",
"\n",
" Kurtosis: | 2.118 | Cond. No. | 59.2 | \n",
"
\n",
"
"
],
"text/plain": [
"\n",
"\"\"\"\n",
" OLS Regression Results \n",
"==============================================================================\n",
"Dep. Variable: rlgdgr R-squared: 0.060\n",
"Model: OLS Adj. R-squared: 0.060\n",
"Method: Least Squares F-statistic: 2067.\n",
"Date: Mon, 16 Nov 2015 Prob (F-statistic): 0.00\n",
"Time: 09:35:28 Log-Likelihood: -5.6310e+05\n",
"No. Observations: 227366 AIC: 1.126e+06\n",
"Df Residuals: 227358 BIC: 1.126e+06\n",
"Df Model: 7 \n",
"Covariance Type: nonrobust \n",
"==============================================================================\n",
" coef std err t P>|t| [95.0% Conf. Int.]\n",
"------------------------------------------------------------------------------\n",
"Intercept 5.6668 0.019 300.071 0.000 5.630 5.704\n",
"yrbrn60 -0.0180 0.000 -47.352 0.000 -0.019 -0.017\n",
"eduyrs12 -0.0688 0.002 -40.159 0.000 -0.072 -0.065\n",
"hinctnta5 -0.0266 0.002 -11.397 0.000 -0.031 -0.022\n",
"tvtot -0.0801 0.003 -26.334 0.000 -0.086 -0.074\n",
"rdtot -0.0179 0.002 -7.791 0.000 -0.022 -0.013\n",
"nwsptot -0.0531 0.005 -10.873 0.000 -0.063 -0.044\n",
"netuse -0.1020 0.002 -40.942 0.000 -0.107 -0.097\n",
"==============================================================================\n",
"Omnibus: 29438.962 Durbin-Watson: 1.629\n",
"Prob(Omnibus): 0.000 Jarque-Bera (JB): 8137.927\n",
"Skew: -0.142 Prob(JB): 0.00\n",
"Kurtosis: 2.118 Cond. No. 59.2\n",
"==============================================================================\n",
"\n",
"Warnings:\n",
"[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
"\"\"\""
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"formula = ('rlgdgr ~ yrbrn60 + eduyrs12 + hinctnta5 +'\n",
" 'tvtot + rdtot + nwsptot + netuse')\n",
"model = smf.ols(formula, data=df) \n",
"res = model.fit(disp=False)\n",
"res.summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Group by country:"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AT 6918\n",
"BE 8939\n",
"BG 6064\n",
"CH 9310\n",
"CY 3293\n",
"CZ 8790\n",
"DE 14487\n",
"DK 7684\n",
"EE 6960\n",
"ES 9729\n",
"FI 9991\n",
"FR 9096\n",
"GB 11117\n",
"GR 9759\n",
"HR 3133\n",
"HU 7806\n",
"IE 10472\n",
"IL 7283\n",
"IS 579\n",
"IT 1207\n",
"LT 1677\n",
"LU 3187\n",
"LV 1980\n",
"NL 9741\n",
"NO 8643\n",
"PL 8917\n",
"PT 10302\n",
"RO 2146\n",
"RU 7544\n",
"SE 9201\n",
"SI 7126\n",
"SK 6944\n",
"TR 4272\n",
"UA 7809\n"
]
}
],
"source": [
"grouped = df.groupby('cntry')\n",
"for name, group in grouped:\n",
" print(name, len(group))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run a sample country"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "ValueError",
"evalue": "endog must be in the unit interval.",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mgb\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgrouped\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_group\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'DK'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mrun_model\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mgb\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mformula\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msummary\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m\u001b[0m in \u001b[0;36mrun_model\u001b[1;34m(df, formula)\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mrun_model\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mformula\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msmf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlogit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mformula\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mresults\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdisp\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresults\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/downey/anaconda/lib/python2.7/site-packages/statsmodels/base/model.pyc\u001b[0m in \u001b[0;36mfrom_formula\u001b[1;34m(cls, formula, data, subset, *args, **kwargs)\u001b[0m\n\u001b[0;32m 148\u001b[0m kwargs.update({'missing_idx': missing_idx,\n\u001b[0;32m 149\u001b[0m 'missing': missing})\n\u001b[1;32m--> 150\u001b[1;33m \u001b[0mmod\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcls\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mendog\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexog\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 151\u001b[0m \u001b[0mmod\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformula\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mformula\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 152\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/downey/anaconda/lib/python2.7/site-packages/statsmodels/discrete/discrete_model.pyc\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, endog, exog, **kwargs)\u001b[0m\n\u001b[0;32m 402\u001b[0m if (self.__class__.__name__ != 'MNLogit' and\n\u001b[0;32m 403\u001b[0m not np.all((self.endog >= 0) & (self.endog <= 1))):\n\u001b[1;32m--> 404\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"endog must be in the unit interval.\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 405\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 406\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mValueError\u001b[0m: endog must be in the unit interval."
]
}
],
"source": [
"gb = grouped.get_group('DK')\n",
"run_model(gb, formula).summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run all countries"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"for name, group in grouped:\n",
" try:\n",
" fill_var(group, var='hinctnta5')\n",
" res = run_model(group, formula)\n",
" nobs, param, stars = extract_res(res)\n",
" arrow = '<--' if stars and param > 0 else ''\n",
" print(name, len(group), nobs, '%0.3g'%param, stars, arrow, sep='\\t')\n",
" except:\n",
" print(name, len(group), ' ', 'NA', sep='\\t')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In more than half of the countries, the association between Internet use and religious affiliation is statistically significant. In all except two, the association is negative.\n",
"\n",
"In many countries we've lost a substantial number of observations due to missing data. Really need to fill that in!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"group = grouped.get_group('FR')\n",
"len(group)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"for col in group.columns:\n",
" print(col, sum(group[col].isnull()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"fill_var(group, 'hinctnta5')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"formula"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"res = run_model(group, formula)\n",
"res.summary()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}