#
# New material added January 28, 2014 (located at the Top & Bottom).
# New material added March 6, 2012 (located at the bottom).
# New material added April 8, 2011 (located near the bottom).
# New material added November 4, 2010 (located near the bottom).
#
############### Some Missing Value Imputation ###############
#
# This script assumes you have worked through all the previous notes from
# the web page and you have downloaded, installed, and updated all available
# R packages.
################################################################################
#
# Newest material added: January 28, 2014
#
####### <<< ****** Random Recursive Partitioning (RRP) ****** >>>
#
# Using the Random Recursive Partitioning (rrp) imputation technique from the
# package 'rrp' and the 'rrp.impute' function. This function is very slow, but
# it is the function we (RSS) recommend for missing value imputation; because...
# it provides estimates with very low bias and variance, and it accepts a data
# frame with numeric and factor variables. Based on our testing, maximum
# likelihood ('norm' package), sequenital k nearest neigbors ('SeqKnn' package &
# 'rrcovNA' package), and random recursive partitioning ('rrp' package) all
# provide very similarly accurate estimates (i.e. low bias and little variance).
#
## NOTE: With the newer versions of R (e.g., 2.15.1); package rrp no longer
## listed at CRAN because it no longer works properly with the new versions
## of R (e.g., 2.15.1). However, the package can be downloaded and installed
## into older versions of R; the newest version of R in which the rrp
## package will work properly is R 2.15.0 which can be downloaded
## and installed from CRAN here:
## http://cran.r-project.org/bin/windows/base/old/2.15.0/
##
## An article explaining how to do this is available from:
## http://www.unt.edu/rss/class/Jon/Benchmarks/rrpImpute_JDS_Feb2014.pdf
##
## To view the updates to this package, see the R-forge page:
## https://r-forge.r-project.org/R/?group_id=1480
##
## If you would like to review the rrp package manual, it is available here:
## http://www.unt.edu/rss/class/Jon/R_SC/Module4/rrp.pdf
#
#<<< January 28, 2014 >>>#
#<<< http://www.unt.edu/rss/class/Jon/Benchmarks/rrpImpute_JDS_Feb2014.pdf >>>#
#<<< USING R 2.15.0 AND THE ARTICLE AT THE URL ABOVE:>>>#
# Import the example data (which is simulated).
data.1 <- read.table(
"http://www.unt.edu/rss/class/Jon/Benchmarks/rrp.ex.data.txt",
header = TRUE, sep = ",", na.strings = "NA",
dec = ".", strip.white = TRUE)
summary(data.1)
data.2 <- data.1[,-1] # Remove the row "id" column (the first column).
summary(data.2)
# Install and then load the rrp package.
install.packages("rrp", repos="http://R-Forge.R-project.org")
library(rrp)
# Set the seed (using the date) so we can replicate exactly the results
# we get.
set.seed(20140114)
# Conduct the imputation and return the new (imputed) data.
data.3 <- rrp.impute(data.2, k = 5, msplit = 10, Rep = 1000,
cut.in = 15)$new.data
# Comparison: notice there are no NA (missing values) in data.3.
summary(data.2)
summary(data.3)
# Export the newly imputed data to the Desktop.
setwd("C:/Users/jds0282/Desktop/")
write.table(data.3, file = "imputed.data.3.txt", sep = ",", na = "NA",
dec = ".", row.names = FALSE, col.names = TRUE)
# Clean up the workspace.
detach("package:rrp")
detach("package:rpart")
detach("package:MASS")
rm(data.1, data.2, data.3)
################################################################################
#
#### Older material is below.
#
# Load the following libraries if you have not already; when an additional
# library is needed, it will be listed in the script and loaded.
library(foreign)
library(Rcmdr)
########## Using library 'VIM' to create a new data.frame with Iterative Robust
## Model-based Imputation (IRMI) values in place of missing values.
## *Note: I generally ignore the GUI that pops up.
library(VIM)
### From VIM help documentation.
# Notice in the output (from the functions below) that R recognizes missing
# values as 'NA'.
data(tao)
summary(tao)
nrow(tao)
names(tao)
na.fail(tao$Year)
na.fail(tao$Sea.Surface.Temp)
na.fail(tao$Humidity)
summary(tao$Year)
summary(tao$Sea.Surface.Temp)
summary(tao$Humidity)
# Create a new data frame with the imputed values in place of the missing ones.
imputed.tao <- irmi(tao)
summary(tao)
summary(imputed.tao)
hist(tao$Humidity)
hist(imputed.tao$Humidity)
########## Using the library 'Amelia' to create new data sets which include
## multiple imputation of incomplete multivariate data values in place of
## missing values...by running the bootstrap EM algorithm.
library(Amelia)
### Example from library 'Amelia' documentation.
data(africa)
summary(africa)
nrow(africa)
# Use 'amelia' function to create the new data set(s).
# Notice the summary (below) tells us there were "5 imputed datasets" created.
# We could increase the number of data sets created by changing m=5 (default)
# to m=whatever number of data sets we want.
a.out <- amelia(x=africa,m=5,cs="country",ts="year",logs="gdp_pc")
summary(a.out)
plot(a.out)
# Now that we have used the 'amelia' function; we need to write the data sets
# created and store them in (by default) in our source directory. However,
# we can also assign the source directory (or 'working directory') to a
# location of our choosing:
setwd("C:/Users/jds0282/Desktop/Workstuff/Jon_R")
# The following function 'write.amelia' takes all the imputed data sets created
# using the 'amelia' function and writes them as new new data files. In this
# case, specified with the names: 'africa.outdata1.csv", "africa.outdata2.csv",
# "africa.outdata3.csv", etc. The 'csv' extension refers to 'comma separated
# values' which is a form of text (.txt) data file with values separated by
# comma.
write.amelia(obj = a.out, separate = TRUE, file.stem = "africa.outdata",
extension=NULL, format = "csv")
# Now we can load any of these 5 new data sets into R from the source directory.
# Generally, the last iteratively produced data set offers the best estimates
# of the missing values/data because, it is based on previous estimates
# (aka. priors).
a.out5 <- read.table("africa.outdata5", header=TRUE, sep=",",
na.strings="NA", dec=".", strip.white=TRUE)
summary(a.out5)
# However, if you are interested in combining the estimates; you would need to
# do multiple runs of the 'amelia' function and then average the last data set
# from each run.
########## Using the library 'mvnmle' to create a complete variance/covariance
## matrix which will include maximum likelihood estimates for missing values.
# Note; this is different from the two methods above, in that here we are not
# concerned with retrieving a new (imputed) data file and instead ARE concerned
# only with a complete variance/covariance matrix based on maximum likelihood
# values imputed where previously missing values existed. This can be useful
# for some multivariate analysis (e.g. SEM, PCA, EFA, etc.)
library(mvnmle)
### From the help file on mlest.
data(apple)
apple
summary(apple)
names(apple)
# Covariance matrix for 'apple'.
cov(apple)
# Notice that because of the 6 missing values on the variable 'worms' we get
# 'NA' for 3 of the 4 elements of the variance/covariance matrix.
# Multiple imputation using the 'mlest' function.
mlest(apple)
# Get only the covariance matrix from the 'mlest' output.
mlest(apple)$sigmahat
imputed.cov.apple <- mlest(apple)$sigmahat
imputed.cov.apple
### Another example from the 'mvnmle' help file for the function 'mlest'.
data(missvals)
missvals
summary(missvals)
cov(missvals)
imputed.cov.missvals <- mlest(missvals)$sigmahat
imputed.cov.missvals
########## Practical Examples ##########
# Some house cleaning. Also, do not forget to go into your source directory and
# delete the imputed example data files created by the 'Amelia' functions.
ls()
rm(a.out, a.out1, a.out2, a.out5, africa, apple, chorizonDL, imputed.cov.apple, imputed.cov.missvals,
imputed.tao, kola.background, missvals, sleep, tao, a.out.avg, a.out3, a.out4, avg.gdp_pc, avg.trade)
ls()
##### Practical examples using 'Example Data 7' from the web page.
# Read in 'ExampleData7.sav' from the web page using the 'foreign' library.
example7 <- read.spss("http://www.unt.edu/rss/class/Jon/R_SC/Module4/ExampleData7.sav", use.value.labels=TRUE,
max.value.labels=Inf, to.data.frame=TRUE)
head(example7)
attach(example7)
# Missing values
na.fail(example7)
names(example7)
na.fail(age)
is.na(age)
summary(age)
# No missing values.
na.fail(Q1)
summary(Q1)
# Build a subset of just the variables/items of interest.
subset1 <- as.data.frame(cbind(age,Q1,Q2,Q3,Q4,Q5))
detach(example7)
attach(subset1)
summary(age)
### Using 'Amelia' library and 'amelia' function.
subset1.out <- amelia(x=subset1)
summary(subset1.out)
plot(subset1.out)
# Write the data files out to the source directory.
write.amelia(subset1.out,"subset1.outdata",extension=NULL,format="csv")
# Load the last (5th) imputed data set back into R from the source directory; your source directory will obviously
# be different than mine (listed below as a file path).
subset1.out5 <- read.table("C:/Documents and Settings/jds0282/Desktop/Work_Stuff/Jon_R/subset1.outdata5", header=TRUE, sep=",",
na.strings="NA", dec=".", strip.white=TRUE)
summary(subset1.out5)
names(subset1.out5)
# Use the new imputed data set (subset1.out5) to create a new 'subset' for
# use in comparison functions (e.g. cov).
detach(subset1)
attach(subset1.out5)
subset2 <- as.data.frame(cbind(age,Q1,Q2,Q3,Q4,Q5))
detach(subset1.out5)
# Compare to make sure the imputations cleared the missing data (NA).
summary(subset1$age)
summary(subset2$age)
cov(subset1)
cov(subset2)
### Using 'mvnmle' library and 'mlest' function.
# Covariance matrix.
cov(subset1)
# Apply the 'mlest' function; then get just the covariance matrix.
mlest(subset1)
cov(subset1)
mlest(subset1)$sigmahat
# Remember, each of the methods above (irmi, amelia, & mlest) are achieving missing
# value imputation in different (and iterative) ways. So, they will often
# provide slightly different estimates; be those estimates individual values
# or a variance/covariance matrix.
## Again, do not forget to go into your source directory and delete imputed
## example data files created by the 'Amelia' functions. In practice, you would
## likely keep the single imputed data file you used for analysis.
################################################################################
#
### New material; added Nov. 4, 2010.
#
# Using the library 'SeqKnn' to do sequential k nearest neighbor missing value
# imputation. Nearest neighbors are records that have similar completed data
# patterns; within a variable, the average of the k-nearest neighbor’s completed
# data are used to impute the value for a variable that is missing it’s value
# (where k can be set by the analyst or R user). Hastie, et al., (1999) have
# shown a k ranging from 5 to 10 is adequate. The advantage of the knn approach
# is that it assumes data are missing at random (MAR) meaning, missing data only
# depends on the observed data; which in turn means, the knn approach is able to
# take advantage of multivariate relationships in the completed data. The
# disadvantage of this approach is it does not include a component to model
# random variation; consequently uncertainty in the imputed value is underestimated.
# Hastie, T., Tibshirani, R., Sherlock, G., Eisen, M., Brown, P. and Botstein, D., Imputing
# Missing Data for Gene Expression Arrays, Stanford University Statistics Department
# Technical report (1999),
# http://www-stat.stanford.edu/~hastie/Papers/missing.pdf
library(SeqKnn)
data(khan05)
imputedData<-SeqKNN(khan05,10)
# Most variables in the origianl (khan05) data contained missing values (NAs).
summary(khan05)
summary(imputedData)
## Package 'rrcovNA' also has a function for sequential nearest neighbor, as
# well as a function which uses a robust estimator to determine the distances
# between the neighboring cases.
library(rrcovNA)
data(bush10)
summary(bush10)
imputed.b10 <- impSeq(bush10)
summary(imputed.b10)
rob.imputed.b10 <- impSeqRob(bush10, alpha=0.9)
summary(rob.imputed.b10$x)
################################################################################
#
### New material; added Apr. 8, 2011.
#
# Use maximum likelihood imputation with the 'norm' package and the 'em.norm',
# the 'prelim.norm', and the 'imp.norm' functions. The 'imp.function' returns
# a data.matrix by default.
library(norm)
# Data must be in matrix form.
subset1.m <- data.matrix(subset1)
# Preliminary sorting and centering.
pre <- prelim.norm(subset1.m)
# The actual maximum likelihood function.
mle <- em.norm(pre)
# Retrieving the complete (imputed and original) data.
rngseed(1234567) # Must set the random seed generator.
mle.imputed <- imp.norm(pre, mle, data.matrix(subset1.m))
summary(subset1)
summary(mle.imputed)
# Keep in mind, you can then save/export the imputed data using any of the base
# functions; such as 'write.table' or 'write.csv'
help(write.table)
################################################################################
#
####### <<< ****** Random Recursive Partitioning (RRP) ****** >>>
#
#
### New material; added Mar. 6, 2012.
#
# Using the Random Recursive Partitioning (rrp) imputation technique from the
# package 'rrp' and the 'rrp.impute' function. This function is very slow, but
# it is the function we (RSS) recommend for missing value imputation; because...
# it provides estimates with very low bias and variance, and it accepts a data
# frame with numeric and factor variables. Based on our testing, maximum
# likelihood ('norm' package), sequenital k nearest neigbors ('SeqKnn' package &
# 'rrcovNA' package), and random recursive partitioning ('rrp' package) all
# provide very similarly accurate estimates (low bias and little variance).
##### October 2, 2012. & January 28, 2014.
#####
##### NOTE: With the newer versions of R (e.g., 2.15.1); package rrp is no longer
##### listed at CRAN. However, the package can be downloaded and installed
##### into older versions of R; the newest version of R in which the rrp
##### package will work properly is R 2.15.0 which can be downloaded
##### and installed from CRAN. An article explaining how to do this is
##### available from
##### http://www.unt.edu/rss/class/Jon/Benchmarks/rrpImpute_JDS_Feb2014.pdf
### Package rrp is available from R-forge by simply running the line below:
install.packages("rrp", repos="http://R-Forge.R-project.org")
### To view the updates to this package, see the R-forge page:
### https://r-forge.r-project.org/R/?group_id=1480
#
# If you would like to review the rrp package manual, it is available here:
# http://www.unt.edu/rss/class/Jon/R_SC/Module4/rrp.pdf
#
### New material; added Jan. 28, 2014.
#
#<<< January 28, 2014 >>>#
#<<< http://www.unt.edu/rss/class/Jon/Benchmarks/rrpImpute_JDS_Feb2014.pdf >>>#
#<<< USING R 2.15.0 AND THE ARTICLE AT THE URL ABOVE:>>>#
data.1 <- read.table(
"http://www.unt.edu/rss/class/Jon/Benchmarks/rrp.ex.data.txt",
header = TRUE, sep = ",", na.strings = "NA",
dec = ".", strip.white = TRUE)
summary(data.1)
data.2 <- data.1[,-1]
summary(data.2)
library(rrp)
set.seed(20140114)
data.3 <- rrp.impute(data.2, k = 5, msplit = 10, Rep = 1000,
cut.in = 15)$new.data
summary(data.2)
summary(data.3)
# Last updated January 28, 2014.