# # New material added March 6, 2012 (located at the bottom). # New material added April 8, 2011 (located near the bottom). # New material added November 4, 2010 (located near the bottom). # ############### Some Missing Value Imputation ############### # # This script assumes you have worked through all the previous notes from # the web page and you have downloaded, installed, and updated all available # R packages. # Load the following libraries if you have not already; when an additional # library is needed, it will be listed in the script and loaded. library(foreign) library(Rcmdr) ########## Using library 'VIM' to create a new data.frame with Iterative Robust ## Model-based Imputation (IRMI) values in place of missing values. ## *Note: I generally ignore the GUI that pops up. library(VIM) ### From VIM help documentation. # Notice in the output (from the functions below) that R recognizes missing # values as 'NA'. data(tao) summary(tao) nrow(tao) names(tao) na.fail(tao$Year) na.fail(tao$Sea.Surface.Temp) na.fail(tao$Humidity) summary(tao$Year) summary(tao$Sea.Surface.Temp) summary(tao$Humidity) # Create a new data frame with the imputed values in place of the missing ones. imputed.tao <- irmi(tao) summary(tao) summary(imputed.tao) hist(tao$Humidity) hist(imputed.tao$Humidity) ########## Using the library 'Amelia' to create new data sets which include ## multiple imputation of incomplete multivariate data values in place of ## missing values...by running the bootstrap EM algorithm. library(Amelia) ### Example from library 'Amelia' documentation. data(africa) summary(africa) nrow(africa) # Use 'amelia' function to create the new data set(s). # Notice the summary (below) tells us there were "5 imputed datasets" created. # We could increase the number of data sets created by changing m=5 (default) # to m=whatever number of data sets we want. a.out <- amelia(x=africa,m=5,cs="country",ts="year",logs="gdp_pc") summary(a.out) plot(a.out) # Now that we have used the 'amelia' function; we need to write the data sets # created and store them in (by default) in our source directory. # The following function 'write.amelia' takes all the imputed data sets created # using the 'amelia' function and writes them as new new data files. In this # case, specified with the names: 'africa.outdata1.csv", "africa.outdata2.csv", # "africa.outdata3.csv", etc. The 'csv' extension refers to 'comma separated # values' which is a form of text (.txt) data file with values separated by # comma. write.amelia(a.out,"africa.outdata",extension=NULL,format="csv") # Now we can load any of these 5 new data sets into R from the source directory. # Generally, the last iteratively produced data set offers the best estimates # of the missing values/data because, it is based on previous estimates # (aka. priors). a.out5 <- read.table("C:/Documents and Settings/jds0282/Desktop/Work_Stuff/Jon_R/africa.outdata5", header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE) summary(a.out5) # However, if you are interested in combining the estimates; you would need to # do multiple runs of the 'amelia' function and then average the last data set # from each run. ########## Using the library 'mvnmle' to create a complete variance/covariance ## matrix which will include maximum likelihood estimates for missing values. # Note; this is different from the two methods above, in that here we are not # concerned with retrieving a new (imputed) data file and instead ARE concerned # only with a complete variance/covariance matrix based on maximum likelihood # values imputed where previously missing values existed. This can be useful # for some multivariate analysis (e.g. SEM, PCA, EFA, etc.) library(mvnmle) ### From the help file on mlest. data(apple) apple summary(apple) names(apple) # Covariance matrix for 'apple'. cov(apple) # Notice that because of the 6 missing values on the variable 'worms' we get # 'NA' for 3 of the 4 elements of the variance/covariance matrix. # Multiple imputation using the 'mlest' function. mlest(apple) # Get only the covariance matrix from the 'mlest' output. mlest(apple)$sigmahat imputed.cov.apple <- mlest(apple)$sigmahat imputed.cov.apple ### Another example from the 'mvnmle' help file for the function 'mlest'. data(missvals) missvals summary(missvals) cov(missvals) imputed.cov.missvals <- mlest(missvals)$sigmahat imputed.cov.missvals ########## Practical Examples ########## # Some house cleaning. Also, do not forget to go into your source directory and # delete the imputed example data files created by the 'Amelia' functions. ls() rm(a.out, a.out1, a.out2, a.out5, africa, apple, chorizonDL, imputed.cov.apple, imputed.cov.missvals, imputed.tao, kola.background, missvals, sleep, tao, a.out.avg, a.out3, a.out4, avg.gdp_pc, avg.trade) ls() ##### Practical examples using 'Example Data 7' from the web page. # Read in 'ExampleData7.sav' from the web page using the 'foreign' library. example7 <- read.spss("http://www.unt.edu/rss/class/Jon/R_SC/Module4/ExampleData7.sav", use.value.labels=TRUE, max.value.labels=Inf, to.data.frame=TRUE) head(example7) attach(example7) # Missing values na.fail(example7) names(example7) na.fail(age) summary(age) # No missing values. na.fail(Q1) summary(Q1) # Build a subset of just the variables/items of interest. subset1 <- as.data.frame(cbind(age,Q1,Q2,Q3,Q4,Q5)) detach(example7) attach(subset1) summary(age) ### Using 'Amelia' library and 'amelia' function. subset1.out <- amelia(x=subset1) summary(subset1.out) plot(subset1.out) # Write the data files out to the source directory. write.amelia(subset1.out,"subset1.outdata",extension=NULL,format="csv") # Load the last (5th) imputed data set back into R from the source directory; your source directory will obviously # be different than mine (listed below as a file path). subset1.out5 <- read.table("C:/Documents and Settings/jds0282/Desktop/Work_Stuff/Jon_R/subset1.outdata5", header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE) summary(subset1.out5) names(subset1.out5) # Use the new imputed data set (subset1.out5) to create a new 'subset' for # use in comparison functions (e.g. cov). detach(subset1) attach(subset1.out5) subset2 <- as.data.frame(cbind(age,Q1,Q2,Q3,Q4,Q5)) detach(subset1.out5) # Compare to make sure the imputations cleared the missing data (NA). summary(subset1$age) summary(subset2$age) cov(subset1) cov(subset2) ### Using 'mvnmle' library and 'mlest' function. # Covariance matrix. cov(subset1) # Apply the 'mlest' function; then get just the covariance matrix. mlest(subset1) cov(subset1) mlest(subset1)$sigmahat # Remember, each of the methods above (irmi, amelia, & mlest) are achieving missing # value imputation in different (and iterative) ways. So, they will often # provide slightly different estimates; be those estimates individual values # or a variance/covariance matrix. ## Again, do not forget to go into your source directory and delete imputed ## example data files created by the 'Amelia' functions. In practice, you would ## likely keep the single imputed data file you used for analysis. ################################################################################ # ### New material; added Nov. 4, 2010. # # Using the library 'SeqKnn' to do sequential k nearest neighbor missing value # imputation. Nearest neighbors are records that have similar completed data # patterns; within a variable, the average of the k-nearest neighbor’s completed # data are used to impute the value for a variable that is missing it’s value # (where k can be set by the analyst or R user). Hastie, et al., (1999) have # shown a k ranging from 5 to 10 is adequate. The advantage of the knn approach # is that it assumes data are missing at random (MAR) meaning, missing data only # depends on the observed data; which in turn means, the knn approach is able to # take advantage of multivariate relationships in the completed data. The # disadvantage of this approach is it does not include a component to model # random variation; consequently uncertainty in the imputed value is underestimated. # Hastie, T., Tibshirani, R., Sherlock, G., Eisen, M., Brown, P. and Botstein, D., Imputing # Missing Data for Gene Expression Arrays, Stanford University Statistics Department # Technical report (1999), # http://www-stat.stanford.edu/~hastie/Papers/missing.pdf library(SeqKnn) data(khan05) imputedData<-SeqKNN(khan05,10) # Most variables in the origianl (khan05) data contained missing values (NAs). summary(khan05) summary(imputedData) ## Package 'rrcovNA' also has a function for sequential nearest neighbor, as # well as a function which uses a robust estimator to determine the distances # between the neighboring cases. library(rrcovNA) data(bush10) summary(bush10) imputed.b10 <- impSeq(bush10) summary(imputed.b10) rob.imputed.b10 <- impSeqRob(bush10, alpha=0.9) summary(rob.imputed.b10$x) ################################################################################ # ### New material; added Apr. 8, 2011. # # Use maximum likelihood imputation with the 'norm' package and the 'em.norm', # the 'prelim.norm', and the 'imp.norm' functions. The 'imp.function' returns # a data.matrix by default. library(norm) # Data must be in matrix form. subset1.m <- data.matrix(subset1) # Preliminary sorting and centering. pre <- prelim.norm(subset1.m) # The actual maximum likelihood function. mle <- em.norm(pre) # Retrieving the complete (imputed and original) data. rngseed(1234567) # Must set the random seed generator. mle.imputed <- imp.norm(pre, mle, data.matrix(subset1.m)) summary(subset1) summary(mle.imputed) # Keep in mind, you can then save/export the imputed data using any of the base # functions; such as 'write.table' or 'write.csv' help(write.table) ################################################################################ # ### New material; added Mar. 6, 2012. # # Using the Random Recursive Partitioning (rrp) imputation technique from the # package 'rrp' and the 'rrp.impute' function. This function is very slow, but # it is the function we (RSS) recommend for missing value imputation; because... # it provides estimates with very low bias and variance, and it accepts a data # frame with numeric and factor variables. Based on our testing, maximum # likelihood ('norm' package), sequenital k nearest neigbors ('SeqKnn' package & # 'rrcovNA' package), and random recursive partitioning ('rrp' package) all # provide very similarly accurate estimates (low bias and little variance). ##### October 2, 2012. ##### NOTE: With the newer versions of R (e.g., 2.15.1); package rrp is no longer ##### listed at CRAN. However, the package can be downloaded and installed from ##### R-forge by simply running the line below: install.packages("rrp", repos="http://R-Forge.R-project.org") ##### To view the updates to this package, see the R-forge page: ##### https://r-forge.r-project.org/R/?group_id=1480 # If you would like to review the rrp package manual, it is available here: # http://www.unt.edu/rss/class/Jon/R_SC/Module4/rrp.pdf summary(subset1) library(rrp) rrp.imputed <- rrp.impute(subset1, k = 8, msplit = 10, Rep = 250, cut.in = 15)$new.data summary(subset1) summary(rrp.imputed) # Last updated August 30, 2012.