Random vs Mice Imputation
David P. Nichols, MD, MPH
July 17, 2018
First, load the necessary libraries
library(tidyverse)
library(mice)
Next, we generate a data frame with three variables. For simplicity, they will have no correlation and will be drawn from a uniform distributionrm(list = ls())
set.seed(123)
x <- round(runif(10000, 1, 10000))
y <- round(runif(10000, 1, 10000))
z <- round(runif(10000, 1, 10000))
df1 <- data_frame("x" = x, "y" = y, "z" = z)
Correlations are all almost zerocor(df1)
## x y z
## x 1.0000000000 -0.01760312 -0.0005817506
## y -0.0176031198 1.00000000 0.0110057126
## z -0.0005817506 0.01100571 1.0000000000
The Mice package will generate a data frame with missing values.df1_m <- mice::ampute(df1,
prop = 0.99)
Here shown diagramatically, we have generated four patterns. “0” indicates no missing values. “1” indicates a missing value in the targeted variable shown by the red square. For example, the lowest row shows a data frame with the x variable containing missing values. The y and z variables are complete. We will use the set with the missing values in x for our comparison.md.pattern(df1_m$amp)
Using the generated vector “x”, the data frame is constructed.
df2 <- data_frame("x" = df1_m$amp$x, "y" = y, "z" = z)
glimpse(df2)
## Observations: 10,000
## Variables: 3
## $ x <dbl> 2876, NA, 4090, 8830, 9405, 457, NA, 8924, NA, 4567, NA, 453...
## $ y <dbl> 3107, 3246, 8703, 3287, 1258, 3563, 9307, 8752, 8202, 219, 6...
## $ z <dbl> 9911, 3023, 4338, 1606, 8230, 2082, 2850, 5463, 5891, 4423, ...
Record the index of each missing value for future comparison.missings1 <- which(is.na(df2)) #pre-impute
These are the values located at these spots. In effect, they are the counterfactuals to the missing data.miss1 <- df1$x[missings1]
Since the x variable is drawn from a uniform distribution, we will create a pool 10000 random values to impute into the missing slots.imps <- round(runif(10000, 0, 10000))
imps <- data_frame(imps)
How many missing values are there in each variable?sum(is.na(df2$x))
## [1] 3152
sum(is.na(df2$y))
## [1] 0
sum(is.na(df2$z))
## [1] 0
Now occurs the random imputation of x. It now has no missing values.df2$x <- if_else(is.na(df2$x), imps$imps, df2$x)
sum(is.na(df2$x))
## [1] 0
These are the values that were imputed into the missing slotsmiss2 <- df2$x[missings1]
Is there a significant difference between the “real” values of a and the imputed values of x?t.test(df1$x, df2$x, paired = TRUE)
##
## Paired t-test
##
## data: df1$x and df2$x
## t = 0.44899, df = 9999, p-value = 0.6534
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -34.50193 55.00353
## sample estimates:
## mean of the differences
## 10.2508
So there is no significant difference between the original and the imputed data. Let’s look at visualizing the data.options(scipen = 99)
ggplot() +
geom_density(aes(x = df1$x), color = "blue") +
geom_density(aes(x = df2$x), color = "red") +
labs(title = "Random Imutation - Original vs. Imputed")
Both real and imputed data are correlated
cor(df1$x, df2$x)
## [1] 0.6846402
The imputed values are not significantly different from the real values.#compare the missing values with the real ones
t.test(miss1, miss2, paired = TRUE)
##
## Paired t-test
##
## data: miss1 and miss2
## t = 0.44895, df = 3151, p-value = 0.6535
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -109.5101 174.5533
## sample estimates:
## mean of the differences
## 32.52157
Did the imputation make any correct hits (bullseyes)?bullseye <- 0
bullseye <- if_else(miss1 == miss2, 1, 0)
sum(bullseye)
## [1] 0
************************************************
MICE Imputation
We will now start over and do the imputation using the MICE algorithm.rm(list = ls())
set.seed(123)
x <- round(runif(10000, 1, 10000))
y <- round(runif(10000, 1, 10000))
z <- round(runif(10000, 1, 10000))
df1 <- data_frame("x" = x, "y" = y, "z" = z)
cor(df1)
## x y z
## x 1.0000000000 -0.01760312 -0.0005817506
## y -0.0176031198 1.00000000 0.0110057126
## z -0.0005817506 0.01100571 1.0000000000
df1_m <- ampute(df1,
prop = 0.99)
md.pattern(df1_m$amp)
## x y z
## 89 1 1 1 0
## 3381 1 1 0 1
## 3378 1 0 1 1
## 3152 0 1 1 1
## 3152 3378 3381 9911
df2 <- data_frame("x" = df1_m$amp$x, "y" = y, "z" = z)
missings1 <- which(is.na(df2)) #pre-impute
miss1 <- df1$x[missings1]
sum(is.na(df2$x))
## [1] 3152
sum(is.na(df2$y))
## [1] 0
sum(is.na(df2$z))
## [1] 0
The MICE algorithmdf2_imp <- mice(df2) #generate the imputed data sets
missings5 <- df2_imp$imp$x
miss2 <- missings5[,1] #just use the first imputation set
miss3 <- miss2 # for later use
toadd <- nrow(df2) - length(miss3)
miss3 <- c(miss3, rep(0, toadd))
miss3 <- data_frame("x" = miss3)
df2$x <- if_else(is.na(df2$x), miss3$x, df2$x) #impute the data
sum(is.na(df2$x)) #how many NA's left?
## [1] 0
ggplot(df2) +
geom_density(aes(x = df1$x), color = "blue") +
geom_density(aes(x = df2$x), color = "red") +
labs(title = "MICE Imutation - Original vs. Imputed")
t.test(miss1, miss2, paired = TRUE)
##
## Paired t-test
##
## data: miss1 and miss2
## t = -0.65864, df = 3151, p-value = 0.5102
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -187.7664 93.3387
## sample estimates:
## mean of the differences
## -47.21383
cor(df1$x, df2$x)
## [1] 0.6052733
bullseye <- 0
bullseye <- if_else(miss1 == miss2, 1, 0)
sum(bullseye)
## [1] 1
************************************************
Now let’s see how random imputation stands up against MICE with a smaller, normally distributed dataset.rm(list = ls())
set.seed(123)
x <- round(rnorm(200, 100, 30))
y <- round(rnorm(200, 100, 30))
z <- round(rnorm(200, 100, 30))
df1 <- data_frame("x" = x, "y" = y, "z" = z)
cor(df1)
## x y z
## x 1.00000000 -0.02555085 -0.02958507
## y -0.02555085 1.00000000 -0.05119379
## z -0.02958507 -0.05119379 1.00000000
df1_m <- mice::ampute(df1,
prop = 0.99)
df2 <- data_frame("x" = df1_m$amp$x, "y" = y, "z" = z)
glimpse(df2)
## Observations: 200
## Variables: 3
## $ x <dbl> NA, NA, 147, NA, 104, 151, 114, 62, NA, 87, 137, NA, 112, 10...
## $ y <dbl> 166, 139, 92, 116, 88, 86, 76, 82, 150, 98, 104, 107, 137, 8...
## $ z <dbl> 98, 65, 81, 99, 120, 50, 90, 123, 84, 107, 115, 108, 120, 96...
missings1 <- which(is.na(df2))
miss1 <- df1$x[missings1]
imps <- round(rnorm(200, 100, 30))
imps <- data_frame(imps)
sum(is.na(df2$x))
## [1] 56
sum(is.na(df2$y))
## [1] 0
sum(is.na(df2$z))
## [1] 0
df2$x <- if_else(is.na(df2$x), imps$imps, df2$x) #impute
sum(is.na(df2$x))
## [1] 0
miss2 <- df2$x[missings1]
t.test(df1$x, df2$x, paired = TRUE)
##
## Paired t-test
##
## data: df1$x and df2$x
## t = 0.48601, df = 199, p-value = 0.6275
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.766981 4.576981
## sample estimates:
## mean of the differences
## 0.905
ggplot() +
geom_density(aes(x = df1$x), color = "blue") +
geom_density(aes(x = df2$x), color = "red") +
labs(title = "Random Imutation - Original vs. Imputed")
cor(df1$x, df2$x)
## [1] 0.5919782
t.test(miss1, miss2, paired = TRUE)
##
## Paired t-test
##
## data: miss1 and miss2
## t = 0.4836, df = 55, p-value = 0.6306
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -10.16194 16.62622
## sample estimates:
## mean of the differences
## 3.232143
bullseye <- 0
bullseye <- if_else(miss1 == miss2, 1, 0)
sum(bullseye)
## [1] 1
************************************************
Now, to do it over again with MICErm(list = ls())
set.seed(123)
x <- round(rnorm(200, 100, 30))
y <- round(rnorm(200, 100, 30))
z <- round(rnorm(200, 100, 30))
df1 <- data_frame("x" = x, "y" = y, "z" = z)
cor(df1)
## x y z
## x 1.00000000 -0.02555085 -0.02958507
## y -0.02555085 1.00000000 -0.05119379
## z -0.02958507 -0.05119379 1.00000000
df1_m <- ampute(df1,
prop = 0.99)
df2 <- data_frame("x" = df1_m$amp$x, "y" = y, "z" = z)
missings1 <- which(is.na(df2)) #pre-impute
miss1 <- df1$x[missings1]
sum(is.na(df2$x))
## [1] 56
sum(is.na(df2$y))
## [1] 0
sum(is.na(df2$z))
## [1] 0
The MICE algorithmdf2_imp <- mice(df2)
missings5 <- df2_imp$imp$x
miss2 <- missings5[,1] #just use the first imputation set
miss3 <- miss2 # for later use
toadd <- nrow(df2) - length(miss3)
miss3 <- c(miss3, rep(0, toadd))
miss3 <- data_frame("x" = miss3)
df2$x <- if_else(is.na(df2$x), miss3$x, df2$x)
sum(is.na(df2$x))
## [1] 0
ggplot(df2) +
geom_density(aes(x = df1$x), color = "blue") +
geom_density(aes(x = df2$x), color = "red") +
labs(title = "MICE Imutation - Original vs. Imputed")
t.test(miss1, miss2, paired = TRUE)
##
## Paired t-test
##
## data: miss1 and miss2
## t = -0.98135, df = 55, p-value = 0.3307
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -15.210621 5.210621
## sample estimates:
## mean of the differences
## -5
cor(df1$x, df2$x)
## [1] 0.3916698
bullseye <- 0
bullseye <- if_else(miss1 == miss2, 1, 0)
sum(bullseye)
## [1] 1
So, it seems that Random Imputation stands up quite well against Mice Imputation. It doesn’t have an opaque black box (at least to me) surrounding it. It certainly is more satisfying than listwise deletion (destructive), or median imputation (disfiguring). I haven’t seen this in my reading but I’m not a statistician.