<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
<head>
<!-- Exactly one encoding declaration and one viewport declaration. -->
<meta charset="utf-8">
<meta name="generator" content="pandoc" />
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Multiple Imputation</title>

<!-- jQuery, jQuery UI and tocify; the inline scripts further down rely on these being loaded first -->
<script src="libs/jquery-1.11.3/jquery.min.js"></script>
<script src="libs/jqueryui-1.11.4/jquery-ui.min.js"></script>
<link href="libs/tocify-1.9.1/jquery.tocify.css" rel="stylesheet" />
<script src="libs/tocify-1.9.1/jquery.tocify.js"></script>

<!-- Bootstrap 3.3.5 (flatly stylesheet) plus the shims shipped alongside it -->
<link href="libs/bootstrap-3.3.5/css/flatly.min.css" rel="stylesheet" />
<script src="libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="libs/bootstrap-3.3.5/shim/respond.min.js"></script>

<!-- Icon font and site stylesheet -->
<link rel="stylesheet" href="libs/font-awesome-4.1.0/css/font-awesome.min.css"/>
<link rel="stylesheet" href="pols503.css"/>

<!-- Code listings: keep whitespace as written and apply highlight.js -->
<style type="text/css">code{white-space: pre;}</style>
<link rel="stylesheet" href="libs/highlight/textmate.css" type="text/css" />
<script src="libs/highlight/highlight.js"></script>
<style type="text/css">
  pre:not([class]) {
    background-color: white;
  }
</style>
<script type="text/javascript">
/* Run highlight.js only if it is present and the document has already finished loading. */
if (window.hljs && document.readyState && document.readyState === "complete") {
  window.setTimeout(function() {
    hljs.initHighlighting();
  }, 0);
}
</script>
</head>

<body>

<style type="text/css">
.main-container {
  max-width: 940px;
  margin-left: auto;
  margin-right: auto;
}
code {
  color: inherit;
  background-color: rgba(0, 0, 0, 0.04);
}
img {
  max-width:100%;
  height: auto;
}
h1 {
  font-size: 34px;
}
h1.title {
  font-size: 38px;
}
h2 {
  font-size: 30px;
}
h3 {
  font-size: 24px;
}
h4 {
  font-size: 18px;
}
h5 {
  font-size: 16px;
}
h6 {
  font-size: 12px;
}
.tabbed-pane {
  padding-top: 12px;
}
button.code-folding-btn:focus {
  outline: none;
}
</style>

<style type="text/css">
/* padding for
bootstrap navbar */
body {
  padding-top: 60px;
  padding-bottom: 40px;
}
/* offset scroll position for anchor links (for fixed navbar) */
.section h1 {
  padding-top: 65px;
  margin-top: -65px;
}
.section h2 {
  padding-top: 65px;
  margin-top: -65px;
}
.section h3 {
  padding-top: 65px;
  margin-top: -65px;
}
.section h4 {
  padding-top: 65px;
  margin-top: -65px;
}
.section h5 {
  padding-top: 65px;
  margin-top: -65px;
}
.section h6 {
  padding-top: 65px;
  margin-top: -65px;
}
</style>

<script>
// Manage the active state of the navbar menu based on the current page.
// Each statement sits on its own line and ends with a semicolon so that the
// line comments cannot swallow the code that follows them.
$(document).ready(function () {
  // File name of the current page (last path segment); an empty segment
  // means the directory index.
  var href = window.location.pathname;
  href = href.substr(href.lastIndexOf('/') + 1);
  if (href === "") {
    href = "index.html";
  }

  // Menu link whose href is exactly this file name.
  var menuAnchor = $('a[href="' + href + '"]');

  // Mark its list item active.
  menuAnchor.parent().addClass('active');

  // If it sits inside a dropdown, mark the dropdown active as well.
  menuAnchor.closest('li.dropdown').addClass('active');
});
</script>


<div class="container-fluid main-container">

<!-- tabsets -->
<script src="libs/navigation-1.0/tabsets.js"></script>
<script>
$(document).ready(function () {
  window.buildTabsets("TOC");
});
</script>

<!-- table of contents (tocify) -->

<script>
$(document).ready(function () {
  // Options handed to the tocify plugin.
  var options = {
    selectors: "h1,h2",
    theme: "bootstrap3",
    context: '.toc-content',
    // Build the hash from the heading text: drop . / ? & ! # < >,
    // replace whitespace with underscores, lower-case the result.
    hashGenerator: function (text) {
      return text.replace(/[.\/?&!#<>]/g, '').replace(/\s/g, '_').toLowerCase();
    },
    ignoreSelector: "h1.title, .toc-ignore",
    scrollTo: 0
  };
  options.showAndHide = true;
  options.smoothScroll = true;

  // Build the table of contents inside #TOC.
  $("#TOC").tocify(options);
});
</script>

<style type="text/css">

#TOC {
  margin: 25px 0px 20px 0px;
}
@media (max-width: 768px) {
#TOC {
  position: relative;
  width: 100%;
}
}

.toc-content {
  padding-left: 30px;
  padding-right: 40px;
}

div.main-container {
  max-width: 1200px;
}

div.tocify {
  width: 20%;
  max-width: 260px;
  max-height: 85%;
}

@media (min-width: 768px) and (max-width: 991px) {
  div.tocify {
    width: 25%;
  }
}

@media (max-width:
767px) {
  div.tocify {
    width: 100%;
    max-width: none;
  }
}

.tocify ul, .tocify li {
  line-height: 20px;
}

.tocify-subheader .tocify-item {
  font-size: 0.9em;
  padding-left: 5px;
}

.tocify .list-group-item {
  border-radius: 0px;
}

</style>

<!-- setup 3col/9col grid for toc_float and main content  -->
<div class="row-fluid">
<div class="col-xs-12 col-sm-4 col-md-3">
<div id="TOC" class="tocify">
</div>
</div>

<div class="toc-content col-xs-12 col-sm-8 col-md-9">

<!-- Site-wide fixed navbar. The toggle button is icon-only, so it carries a
     screen-reader label and points at the collapsible region it controls. -->
<div class="navbar navbar-default navbar-fixed-top" role="navigation">
  <div class="container">
    <div class="navbar-header">
      <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#navbar" aria-expanded="false" aria-controls="navbar">
        <span class="sr-only">Toggle navigation</span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
        <span class="icon-bar"></span>
      </button>
      <a class="navbar-brand" href="https://UW-POLS503.github.io/pols_503_sp16">POLS/CS&amp;SS 503</a>
    </div>
    <div id="navbar" class="navbar-collapse collapse">
      <ul class="nav navbar-nav">
        <li><a href="index.html">Home</a></li>
        <li><a href="schedule.html">Schedule</a></li>
        <li><a href="https://uw-pols503.github.io/pols503-notes/">Notes</a></li>
        <!-- start assignments dropdown -->
        <li class="dropdown">
          <a href="assignments" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Assignments <span class="caret"></span></a>
          <ul class="dropdown-menu" role="menu">
            <!-- ADD NEW ASSIGNMENTS HERE -->
            <li class="dropdown-header">Assignments</li>
            <li><a href="https://github.com/UW-POLS503/Assignment_01">Assignment 1</a></li>
            <li class="divider"></li>
            <li class="dropdown-header">Project</li>
            <li><a href="assignments_project_1.html">Project Assignment 1</a></li>
            <li><a href="assignments_project_2.html">Project Assignment 2</a></li>
            <li><a href="assignments_project_3.html">Project Assignment 3</a></li>
            <li><a href="data_analysis_project_paper_guidelines.html">Final Project</a></li>
            <li class="divider"></li>
            <li class="dropdown-header">Peer Review</li>
            <li><a href="assignments_peer_review_1.html">Peer Review 1</a></li>
            <li><a href="assignments_peer_review_2.html">Peer Review 2</a></li>
          </ul>
        </li>
        <!-- end assignments dropdown -->
        <!-- start lessons dropdown -->
        <li class="dropdown">
          <a href="lessons" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Lessons <span class="caret"></span></a>
          <ul class="dropdown-menu" role="menu">
            <!-- ADD NEW LESSONS HERE -->
            <li><a href="lessons_install_R.html">Installing R</a></li>
            <li><a href="lessons_git.html">Getting Started with Git and GitHub</a></li>
            <li><a href="lessons_writing_functions.html">Writing Functions</a></li>
            <li><a href="lessons_loops_conditionals.html">Loops and Conditional Execution</a></li>
            <li><a href="lessons_functional_forms2.html">Functional Forms</a></li>
            <li><a href="lessons_imputation.html">Multiple Imputation</a></li>
            <li><a href="lessons_weights.html">Weights</a></li>
            <li><a href="lessons_categorical_covariates.html">Categorical covariates</a></li>
          </ul>
        </li>
        <!-- end lessons dropdown -->
        <!-- start references dropdown -->
        <li class="dropdown">
          <a href="references" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">References <span class="caret"></span></a>
          <ul class="dropdown-menu" role="menu">
            <!-- ADD NEW REFERENCE PAGES HERE -->
            <li><a href="writing-advice.html">Writing Advice</a></li>
            <li><a href="latex4research.html">LaTeX</a></li>
            <li><a href="word4research.html">Word for Research</a></li>
            <li><a href="Rmarkdown.html">R Markdown</a></li>
            <li><a href="stata_to_R.html">Moving from Stata to R</a></li>
            <li><a href="submitting-assign.html"> Submitting Assignments</a></li>
          </ul>
        </li>
        <!-- end references dropdown -->
      </ul>
      <ul class="nav navbar-nav navbar-right">
        <li><a href="https://github.com/UW-POLS503/pols_503_sp16/issues">Report Bug</a></li>
      </ul>
    </div><!--/.nav-collapse -->
  </div><!--/.container -->
</div><!--/.navbar -->

<div class="fluid-row" id="header">


<h1 class="title">Multiple
Imputation</h1> </div> <div id="required-packages-and-datasets" class="section level2"> <h2>Required Packages and Datasets</h2> <p>In this lesson we will use the <code>Amelia</code> package and a subset of Beatrice Magistro’s <a href="https://www.dropbox.com/s/521gqn9neemmsd3/ess_sub.csv?dl=0">dataset</a>, with data from the <code>European Social Survey</code>.</p> <pre class="r"><code># install.packages("Amelia") library(Amelia) library(ggplot2) library(dplyr) ess_sub <- read.csv("data/ess_sub.csv")</code></pre> <p>Each row in the dataset is an individual’s response to the survey and it has the following variables:</p> <table> <thead> <tr class="header"> <th align="left">variable</th> <th align="left">description</th> </tr> </thead> <tbody> <tr class="odd"> <td align="left"><code>stfdem</code></td> <td align="left">Satisfaction with democracy: {0-10 scale} (ordinal)</td> </tr> <tr class="even"> <td align="left"><code>year</code></td> <td align="left">Year: {2002, 2003, …, 2012} (time variable)</td> </tr> <tr class="odd"> <td align="left"><code>cntry</code></td> <td align="left">Country: {DE, GB, …, NL} (cross section variable)</td> </tr> <tr class="even"> <td align="left"><code>crisis</code></td> <td align="left">Crisis: {post, pre} (ordinal)</td> </tr> <tr class="odd"> <td align="left"><code>age_gr</code></td> <td align="left">Age group: {18-34, 35-64, +65} (ordinal)</td> </tr> <tr class="even"> <td align="left"><code>edulvla</code></td> <td align="left">Education level: {low, medium, high} (ordinal)</td> </tr> <tr class="odd"> <td align="left"><code>gndr</code></td> <td align="left">Gender: {Men, Women} (categorical)</td> </tr> <tr class="even"> <td align="left"><code>peripherial</code></td> <td align="left">Peripheral countries: {core, peri} (categorical)</td> </tr> </tbody> </table> </div> <div id="introduction" class="section level2"> <h2>Introduction</h2> <p>Often the datasets we use to test our theories and hypotheses have some, and sometimes numerous, 
missing values (<code>NA</code>). What do we do?</p> <p>One option would be to drop the rows that have a missing value for one of our covariates and/or the variable of interest. For example, when we estimate a linear model, <code>R</code> automatically gets rid of the rows with missing values for the variables in the model (“listwise deletion”). In this case, 7,344 observations are dropped.</p> <pre class="r"><code>model <- lm(stfdem ~ crisis + age_gr + edulvla + gndr + peripherial, data = ess_sub) summary(model)</code></pre> <pre><code>## ## Call: ## lm(formula = stfdem ~ crisis + age_gr + edulvla + gndr + peripherial, ## data = ess_sub) ## ## Residuals: ## Min 1Q Median 3Q Max ## -6.0930 -1.6938 0.1299 1.8842 5.6123 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 5.87012 0.01879 312.469 < 2e-16 *** ## crisispre 0.22285 0.01177 18.931 < 2e-16 *** ## age_gr18-34 -0.10679 0.01754 -6.087 1.15e-09 *** ## age_gr35-64 -0.20238 0.01500 -13.488 < 2e-16 *** ## edulvlalow -0.71168 0.01522 -46.752 < 2e-16 *** ## edulvlamedium -0.62842 0.01382 -45.470 < 2e-16 *** ## gndrWomen -0.24200 0.01134 -21.349 < 2e-16 *** ## peripherialperi -0.32635 0.01316 -24.800 < 2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 2.414 on 182566 degrees of freedom ## (7344 observations deleted due to missingness) ## Multiple R-squared: 0.02418, Adjusted R-squared: 0.02414 ## F-statistic: 646.1 on 7 and 182566 DF, p-value: < 2.2e-16</code></pre> <p>However, despite not having a value for a particular variable or set of variables, those rows may provide information about the other variables. Moreover, dropping those observations may result in biased and inefficient estimates.</p> <p>The other option is to <strong>impute</strong> (“fill in”) the missing values. In this lesson we will see one method and <code>R package</code> to do so: <code>Amelia</code>. 
This method assumes that the data:</p> <ol style="list-style-type: lower-alpha"> <li>follows a multivariate normal distribution</li> <li>is missing at random -MAR- (which means that the <em>missingness</em> depends only on the observed data)</li> </ol> <p>Then <code>Amelia</code> uses a bootstrap-EM algorithm (EMB) to estimate/impute the missing values.</p> </div> <div id="exploring-missing-values" class="section level2"> <h2>Exploring missing values</h2> <p>The <code>summary()</code> function provides you with information about the number of missing values per variable. This dataset has missing values for the outcome variable (<code>stfdem</code>) and the covariates <code>edulvla</code> and <code>gndr</code>.</p> <pre class="r"><code>summary(ess_sub)</code></pre> <pre><code>## stfdem year cntry crisis ## Min. : 0.000 Min. :2002 DE : 16145 post: 68948 ## 1st Qu.: 4.000 1st Qu.:2004 GB : 12521 pre :120970 ## Median : 5.000 Median :2008 IE : 12324 ## Mean : 5.189 Mean :2007 PT : 11774 ## 3rd Qu.: 7.000 3rd Qu.:2010 FI : 11260 ## Max. :10.000 Max. :2012 NL : 11048 ## NA's :6405 (Other):114846 ## age_gr edulvla gndr peripherial ## +65 : 40674 high :55013 Men : 88053 core:137997 ## 18-34: 46887 low :61545 Women:101752 peri: 51921 ## 35-64:102357 medium:72443 NA's : 113 ## NA's : 917 ## ## ## </code></pre> <p>The <code>Amelia</code> package has a function that helps visualize the missing data in a dataset: <code>missmap()</code>. Warning: it takes a few minutes to run, depending on the size of the dataset.</p> <pre class="r"><code>missmap(ess_sub)</code></pre> <p><img src="lessons_imputation_files/figure-html/unnamed-chunk-5-1.png" title="" alt="Missingness map of ess_sub produced by missmap()" width="672" /></p> </div> <div id="imputation" class="section level2"> <h2>Imputation</h2> <p>The <code>amelia()</code> function takes the following parameters:</p> <ul> <li><code>x</code>: the dataset (e.g. ess_sub)</li> <li><code>m</code>: number of imputed datasets to create (e.g. 
5, see Fox, p.564)</li> <li><code>logs</code>: a vector with variables that are <code>log</code> transformations</li> <li><code>lgstc</code>: a vector with variables that are <code>logistic</code> transformations</li> <li><code>noms</code>: a vector with variables that are nominal (e.g. <code>gndr</code>, <code>peripherial</code>)</li> <li><code>ords</code>: a vector with variables that are ordinal (e.g. <code>stfdem</code>)</li> <li><code>ts</code>: name of the variable indicating time (for time series data) (e.g. <code>year</code>)</li> <li><code>cs</code>: name of the cross section variable (for cross section data) (e.g. <code>cntry</code>)</li> <li><code>idvars</code>: name of a variable indicating ID, so <code>Amelia</code> can ignore it</li> </ul> <p>The <code>amelia()</code> function will create <code>m</code> (here, 5) new datasets with imputed values for all the missing values in <code>ess_sub</code>.</p> <pre class="r"><code>m <- 5 amelia_output <- amelia(ess_sub, m = m, ts = "year" , cs = "cntry", ords = c("stfdem", "crisis", "age_gr","edulvla"), noms = c("gndr", "peripherial"))</code></pre> <pre><code>## -- Imputation 1 -- ## ## 1 2 ## ## -- Imputation 2 -- ## ## 1 2 ## ## -- Imputation 3 -- ## ## 1 2 ## ## -- Imputation 4 -- ## ## 1 2 ## ## -- Imputation 5 -- ## ## 1 2</code></pre> <p>The imputed datasets are in the <code>imputations</code> list within the <code>amelia_output</code>. You can combine them into one dataset using the following loop:</p> <pre class="r"><code>ess_all <- NULL for (i in 1:length(amelia_output$imputations)) { imp <- as.data.frame(amelia_output$imputations[i]) colnames(imp) <- colnames(ess_sub) imp$imp <- paste0("imp", i) ess_all <- rbind(ess_all, imp) }</code></pre> <p>You can also access a matrix indicating the original missing values in the <code>amelia_output</code>. 
This will be useful to compare differences between actual and imputed data and to make judgements about the quality/plausibility of the imputed data.</p> <pre class="r"><code>missMatrix <- amelia_output$missMatrix</code></pre> </div> <div id="diagnostics" class="section level2"> <h2>Diagnostics</h2> <p>For the variables with missing values (e.g. <code>stfdem</code>, <code>edulvla</code>), explore differences across imputed datasets.</p> <pre class="r"><code>ggplot(ess_all %>% group_by(imp,stfdem)%>% summarize(n = n()), aes(y = n, x = factor(stfdem))) + geom_bar(stat = "identity") + geom_text(aes(label = n), vjust = 0.5, y = 1000, hjust = 0,position = , angle = 90, color = "white") + facet_wrap(~ imp)</code></pre> <p><img src="lessons_imputation_files/figure-html/unnamed-chunk-9-1.png" title="" alt="Bar charts of the number of observations per stfdem value, one panel per imputed dataset" width="672" /></p> <pre class="r"><code>ggplot(ess_all %>% group_by(imp,edulvla)%>% summarize(n = n()), aes(y = n, x = factor(edulvla))) + geom_bar(stat = "identity") + geom_text(aes(label = n), vjust = 0.5, y = 20000, hjust = 0.5,position = , color = "white") + facet_wrap(~ imp)</code></pre> <p><img src="lessons_imputation_files/figure-html/unnamed-chunk-10-1.png" title="" alt="Bar charts of the number of observations per edulvla value, one panel per imputed dataset" width="672" /></p> <p>Check Jeff Arnold’s lesson on <a href="https://uw-pols503.github.io/pols_503_sp16/missing_data_imputation.html#models">Multiple Imputation</a> to see other ways of evaluating the plausibility of the imputed data.</p> </div> <div id="combining-imputated-datasets-in-your-analysis" class="section level2"> <h2>Combining imputed datasets in your analysis</h2> <p>See Fox p.656 for details about how to combine the coefficients from different imputed datasets.</p> <p>The <code>Amelia</code> package has a function to do so: <code>mi.meld()</code>. The function takes 2 parameters:</p> <ul> <li><code>q</code>: a dataset/matrix with <em>k</em> rows (<em>k</em> = number of imputed datasets) and <em>v</em> columns (<em>v</em> = number of covariates in the model). 
This dataset contains the coefficients for all covariates across imputed datasets.</li> <li><code>se</code>: a dataset/matrix with <em>k</em> rows (<em>k</em> = number of imputed datasets) and <em>v</em> columns (<em>v</em> = number of covariates in the model). This dataset contains the standard errors for all covariates across imputed datasets.</li> </ul> <p>Calculate <code>q</code> and <code>se</code></p> <pre class="r"><code>q <- NULL se <- NULL form <- formula(stfdem ~ + crisis + age_gr + edulvla + gndr + peripherial) for (i in 1:m) { model <- lm(form, data = amelia_output$imputations[[i]]) q <- rbind(q, coef(model)) se <- rbind(se, coef(summary(model))[,2]) } q</code></pre> <pre><code>## (Intercept) crisispre age_gr18-34 age_gr35-64 edulvlalow ## [1,] 5.874327 0.2180225 -0.11535668 -0.2099701 -0.6956348 ## [2,] 5.867108 0.2221274 -0.11024628 -0.2062861 -0.6938102 ## [3,] 5.867050 0.2222165 -0.09941342 -0.2009758 -0.6930619 ## [4,] 5.868573 0.2184105 -0.10471958 -0.2029573 -0.6904310 ## [5,] 5.864418 0.2221892 -0.10514010 -0.1998890 -0.6960684 ## edulvlamedium gndrWomen peripherialperi ## [1,] -0.6297112 -0.2389228 -0.3270229 ## [2,] -0.6276297 -0.2380649 -0.3295379 ## [3,] -0.6298683 -0.2419523 -0.3363500 ## [4,] -0.6307592 -0.2403058 -0.3271767 ## [5,] -0.6299122 -0.2383574 -0.3305466</code></pre> <pre class="r"><code>se</code></pre> <pre><code>## (Intercept) crisispre age_gr18-34 age_gr35-64 edulvlalow ## [1,] 0.01855185 0.01160933 0.01719708 0.01468296 0.01498298 ## [2,] 0.01855029 0.01161053 0.01719835 0.01468398 0.01498446 ## [3,] 0.01854366 0.01160459 0.01719172 0.01467780 0.01497655 ## [4,] 0.01854561 0.01160602 0.01719158 0.01467775 0.01498019 ## [5,] 0.01855177 0.01160988 0.01719767 0.01468360 0.01498452 ## edulvlamedium gndrWomen peripherialperi ## [1,] 0.01372982 0.01118389 0.01293630 ## [2,] 0.01372948 0.01118494 0.01293854 ## [3,] 0.01372459 0.01117896 0.01292999 ## [4,] 0.01372468 0.01118066 0.01293270 ## [5,] 0.01372955 0.01118407 
0.01293638</code></pre> <pre class="r"><code>cb_model <- mi.meld(q = q, se = se) cb_model_df <- data.frame(varname = colnames(cb_model[[1]]), coef = as.numeric(cb_model[[1]]), se = as.numeric(cb_model[[2]]), coef.lwr = as.numeric(cb_model[[1]] - (2 * as.numeric(cb_model[[2]]))), coef.upr = as.numeric(cb_model[[1]] + (2 * as.numeric(cb_model[[2]])))) cb_model_df</code></pre> <pre><code>## varname coef se coef.lwr coef.upr ## 1 (Intercept) 5.8682952 0.01898387 5.8303274 5.90626290 ## 2 crisispre 0.2205932 0.01184989 0.1968934 0.24429300 ## 3 age_gr18-34 -0.1069752 0.01842968 -0.1438346 -0.07011585 ## 4 age_gr35-64 -0.2040157 0.01536028 -0.2347362 -0.17329510 ## 5 edulvlalow -0.6938013 0.01518468 -0.7241706 -0.66343191 ## 6 edulvlamedium -0.6295761 0.01378656 -0.6571492 -0.60200299 ## 7 gndrWomen -0.2395207 0.01132061 -0.2621619 -0.21687945 ## 8 peripherialperi -0.3301268 0.01358620 -0.3572992 -0.30295442</code></pre> </div>

<!-- some extra javascript for older browsers -->
<script type="text/javascript" src="libs/polyfill.js"></script>

</div>
</div>

</div>

<script>
// Add bootstrap table styles to pandoc tables.
// The comment sits on its own lines so it cannot swallow the handler below.
$(document).ready(function () {
  // Any table whose <thead> contains a tr.header row gets the Bootstrap
  // "table table-condensed" classes.
  $('tr.header').parent('thead').parent('table').addClass('table table-condensed');
});
</script>

<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
  (function () {
    // Append the MathJax loader to <head> at runtime. It is fetched from
    // cdnjs because the cdn.mathjax.org host has been retired.
    var script = document.createElement("script");
    script.type = "text/javascript";
    script.src  = "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
    document.getElementsByTagName("head")[0].appendChild(script);
  })();
</script>

</body>
</html>