Random vs Mice Imputation
David P. Nichols, MD, MPH
July 17, 2018
First, load the necessary libraries
library(tidyverse)
library(mice)rm(list = ls())
set.seed(123)
x <- round(runif(10000, 1, 10000))
y <- round(runif(10000, 1, 10000))
z <- round(runif(10000, 1, 10000))
df1 <- data_frame("x" = x, "y" = y, "z" = z)cor(df1)##               x           y             z
## x  1.0000000000 -0.01760312 -0.0005817506
## y -0.0176031198  1.00000000  0.0110057126
## z -0.0005817506  0.01100571  1.0000000000df1_m <- mice::ampute(df1,
              prop = 0.99)md.pattern(df1_m$amp)    Using the generated vector “x”, the data frame is constructed.
df2 <- data_frame("x" = df1_m$amp$x, "y" = y, "z" = z)
glimpse(df2)## Observations: 10,000
## Variables: 3
## $ x <dbl> 2876, NA, 4090, 8830, 9405, 457, NA, 8924, NA, 4567, NA, 453...
## $ y <dbl> 3107, 3246, 8703, 3287, 1258, 3563, 9307, 8752, 8202, 219, 6...
## $ z <dbl> 9911, 3023, 4338, 1606, 8230, 2082, 2850, 5463, 5891, 4423, ...missings1 <-  which(is.na(df2)) #pre-imputemiss1 <- df1$x[missings1]imps <- round(runif(10000, 0, 10000))
imps <- data_frame(imps)sum(is.na(df2$x)) ## [1] 3152sum(is.na(df2$y)) ## [1] 0sum(is.na(df2$z)) ## [1] 0df2$x <- if_else(is.na(df2$x), imps$imps, df2$x)
sum(is.na(df2$x))## [1] 0miss2 <- df2$x[missings1]t.test(df1$x, df2$x, paired = TRUE)## 
##  Paired t-test
## 
## data:  df1$x and df2$x
## t = 0.44899, df = 9999, p-value = 0.6534
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -34.50193  55.00353
## sample estimates:
## mean of the differences 
##                 10.2508options(scipen = 99)
ggplot() +
  geom_density(aes(x = df1$x), color = "blue") +
  geom_density(aes(x = df2$x), color = "red") +
  labs(title = "Random Imutation - Original vs. Imputed") Both real and imputed data are correlated
cor(df1$x, df2$x)## [1] 0.6846402#compare the missing values with the real ones
t.test(miss1, miss2, paired = TRUE)## 
##  Paired t-test
## 
## data:  miss1 and miss2
## t = 0.44895, df = 3151, p-value = 0.6535
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -109.5101  174.5533
## sample estimates:
## mean of the differences 
##                32.52157bullseye <-  0
bullseye <- if_else(miss1 == miss2, 1, 0)
sum(bullseye)## [1] 0************************************************
MICE Imputation
We will now start over and do the imputation using the MICE algorithm.rm(list = ls())
set.seed(123)x <- round(runif(10000, 1, 10000))
y <- round(runif(10000, 1, 10000))
z <- round(runif(10000, 1, 10000))
df1 <- data_frame("x" = x, "y" = y, "z" = z)
cor(df1)##               x           y             z
## x  1.0000000000 -0.01760312 -0.0005817506
## y -0.0176031198  1.00000000  0.0110057126
## z -0.0005817506  0.01100571  1.0000000000df1_m <- ampute(df1,
              prop = 0.99)
md.pattern(df1_m$amp)              ##         x    y    z     
## 89      1    1    1    0
## 3381    1    1    0    1
## 3378    1    0    1    1
## 3152    0    1    1    1
##      3152 3378 3381 9911df2 <- data_frame("x" = df1_m$amp$x, "y" = y, "z" = z)missings1 <-  which(is.na(df2)) #pre-impute
miss1 <- df1$x[missings1]
sum(is.na(df2$x))## [1] 3152sum(is.na(df2$y))## [1] 0sum(is.na(df2$z))## [1] 0df2_imp <- mice(df2) #generate the imputed data setsmissings5 <- df2_imp$imp$x
miss2 <- missings5[,1] #just use the first imputation set
miss3 <- miss2 # for later usetoadd <- nrow(df2) - length(miss3)
miss3 <- c(miss3, rep(0, toadd))
miss3 <- data_frame("x" = miss3)
df2$x <- if_else(is.na(df2$x), miss3$x, df2$x) #impute the data
sum(is.na(df2$x)) #how many NA's left?## [1] 0ggplot(df2) +
  geom_density(aes(x = df1$x), color = "blue") +
  geom_density(aes(x = df2$x), color = "red") +
  labs(title = "MICE Imutation - Original vs. Imputed")t.test(miss1, miss2, paired = TRUE)## 
##  Paired t-test
## 
## data:  miss1 and miss2
## t = -0.65864, df = 3151, p-value = 0.5102
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -187.7664   93.3387
## sample estimates:
## mean of the differences 
##               -47.21383cor(df1$x, df2$x)## [1] 0.6052733bullseye <-  0
bullseye <- if_else(miss1 == miss2, 1, 0)
sum(bullseye)## [1] 1************************************************
Now let’s see how random imputation stands up against MICE with a smaller, normally distributed dataset.rm(list = ls())
set.seed(123)
x <- round(rnorm(200, 100, 30))
y <- round(rnorm(200, 100, 30))
z <- round(rnorm(200, 100, 30))
df1 <- data_frame("x" = x, "y" = y, "z" = z)
cor(df1)##             x           y           z
## x  1.00000000 -0.02555085 -0.02958507
## y -0.02555085  1.00000000 -0.05119379
## z -0.02958507 -0.05119379  1.00000000df1_m <- mice::ampute(df1,
              prop = 0.99)
df2 <- data_frame("x" = df1_m$amp$x, "y" = y, "z" = z)
glimpse(df2)## Observations: 200
## Variables: 3
## $ x <dbl> NA, NA, 147, NA, 104, 151, 114, 62, NA, 87, 137, NA, 112, 10...
## $ y <dbl> 166, 139, 92, 116, 88, 86, 76, 82, 150, 98, 104, 107, 137, 8...
## $ z <dbl> 98, 65, 81, 99, 120, 50, 90, 123, 84, 107, 115, 108, 120, 96...missings1 <-  which(is.na(df2))
miss1 <- df1$x[missings1]
imps <- round(rnorm(200, 100, 30))
imps <- data_frame(imps)sum(is.na(df2$x)) ## [1] 56sum(is.na(df2$y)) ## [1] 0sum(is.na(df2$z)) ## [1] 0df2$x <- if_else(is.na(df2$x), imps$imps, df2$x) #impute
sum(is.na(df2$x))## [1] 0miss2 <- df2$x[missings1]
t.test(df1$x, df2$x, paired = TRUE)## 
##  Paired t-test
## 
## data:  df1$x and df2$x
## t = 0.48601, df = 199, p-value = 0.6275
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.766981  4.576981
## sample estimates:
## mean of the differences 
##                   0.905ggplot() +
  geom_density(aes(x = df1$x), color = "blue") +
  geom_density(aes(x = df2$x), color = "red") +
  labs(title = "Random Imutation - Original vs. Imputed") cor(df1$x, df2$x)## [1] 0.5919782t.test(miss1, miss2, paired = TRUE)## 
##  Paired t-test
## 
## data:  miss1 and miss2
## t = 0.4836, df = 55, p-value = 0.6306
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -10.16194  16.62622
## sample estimates:
## mean of the differences 
##                3.232143bullseye <-  0
bullseye <- if_else(miss1 == miss2, 1, 0)
sum(bullseye)## [1] 1************************************************
Now, to do it over again with MICErm(list = ls())
set.seed(123)x <- round(rnorm(200, 100, 30))
y <- round(rnorm(200, 100, 30))
z <- round(rnorm(200, 100, 30))
df1 <- data_frame("x" = x, "y" = y, "z" = z)
cor(df1)##             x           y           z
## x  1.00000000 -0.02555085 -0.02958507
## y -0.02555085  1.00000000 -0.05119379
## z -0.02958507 -0.05119379  1.00000000df1_m <- ampute(df1,
              prop = 0.99)
df2 <- data_frame("x" = df1_m$amp$x, "y" = y, "z" = z)missings1 <-  which(is.na(df2)) #pre-impute
miss1 <- df1$x[missings1]
sum(is.na(df2$x))## [1] 56sum(is.na(df2$y))## [1] 0sum(is.na(df2$z))## [1] 0df2_imp <- mice(df2)missings5 <- df2_imp$imp$x
miss2 <- missings5[,1] #just use the first imputation set
miss3 <- miss2 # for later usetoadd <- nrow(df2) - length(miss3)
miss3 <- c(miss3, rep(0, toadd))
miss3 <- data_frame("x" = miss3)
df2$x <- if_else(is.na(df2$x), miss3$x, df2$x)
sum(is.na(df2$x))## [1] 0ggplot(df2) +
  geom_density(aes(x = df1$x), color = "blue") +
  geom_density(aes(x = df2$x), color = "red") +
  labs(title = "MICE Imutation - Original vs. Imputed")t.test(miss1, miss2, paired = TRUE)## 
##  Paired t-test
## 
## data:  miss1 and miss2
## t = -0.98135, df = 55, p-value = 0.3307
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -15.210621   5.210621
## sample estimates:
## mean of the differences 
##                      -5cor(df1$x, df2$x)## [1] 0.3916698bullseye <-  0
bullseye <- if_else(miss1 == miss2, 1, 0)
sum(bullseye)## [1] 1