# # ###### Using the 'MatchIt' package to remove the influence of covariates on a causal model. # # 'MatchIt' has several methods which can be used to 'balance' the effects of one or # more covariates (i.e. confounds) across the two levels of a treatment variable (e.g. # dichotomous 'grouping variable' predictor). # First, read in the example data from the web naming it "data.df", get a summary, and # take note of the number of rows (nrow). This data is simulated and was created specifically # as an example for discussing matching in a regression situation. In the summary output # notice that all variables are numeric; although the dichotomous grouping variable (0 = control # & 1 = treatment) is g1. The covariates (confounder variables) are c1 and c2; with c1 being # dichotomous and c2 being continuous. The continuous outcome variable is y1. Both x2 and x3 # are continuous predictors of y1 along with the grouping variable (g1); but x2 and x3 are # not related to g1, c1, or c2. data.df <- read.table("http://www.unt.edu/rss/class/Jon/Benchmarks/Data4MatchItExample.txt", header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE) summary(data.df) nrow(data.df) # Load the 'MatchIt' package. search() library(MatchIt) search() # Next, we run our first 'matchit' with all the default values for each argument specified # and get a summary of the results. m.out.1 <- matchit(formula = g1 ~ c1 + c2 + x2 + x3, data = data.df, method = "nearest", distance = "logit", discard = "none", reestimate = "FALSE") summary(m.out.1) # There are four pieces of output produced by the summary function. The summary of balance # for "all data" (i.e. the original data), the summary of balance for the "matched data", # the percentages of balance improvement, and the sample size summaries. # The output of the summary on 'm.out.1' reveals rather strikingly perfect matching. The # key elements to focus on are the 'Mean Diff' for the "all data" compared to the 'Mean # Diff' for the "matched data" -- notice there are no differences. This is confirmed by # noting the Percent Balance Improvement where the zero values indicate # a zero percentage change. # Furthermore, notice that all the control cases were retained and all the treated cases. # Essentially, nothing has been done; because, each control case was matched to each treated # case on the distance measure; no selection has taken place based on the distance measure. # One could use the 'discard' optional argument to specify a distance criterion; or specify # a region of common support. # This first example also points out why one should not simply take for granted the # default values for arguments of a function. rm(m.out.1) # As a comparison; and to show the reason one would want to use the 'matchit' function, # we run an example using the "genetic" (algorithm) method, "rpart" distance, and # discard "hull.control" which retains all the treatment cases. m.out.2 <- matchit(formula = g1 ~ c1 + c2 + x2 + x3, data = data.df, method = "genetic", distance = "rpart", discard = "hull.control", reestimate = "FALSE") summary(m.out.2) # In this summary, we notice that although the mean differences were drastically reduced # for the two covariates (c1 & c2), the mean difference actually increased # for one of the two predictors (x2). This is a result of those two # predictors NOT being related to the grouping variable. So, we might run a third # version of the 'matchit' function; including only the two covariates. m.out.3 <- matchit(formula = g1 ~ c1 + c2, data = data.df, method = "genetic", distance = "rpart", discard = "hull.control", reestimate = "FALSE") summary(m.out.3) # With this summary (m.out.3) we see large reductions in the mean differences and # the corresponding percent balance improvements. # We can also adjust the 'GenMatch' function which is called by the method = "genetic" # to better take advantage of the 'GenMatch'; possibly improving the results, but also # possibly reducing them. Below, the defaults are shown -- which produce the same # output as the previous run (m.out.3). m.out.4 <- matchit(formula = g1 ~ c1 + c2, data = data.df, method = "genetic", distance = "rpart", pop.size = 15, max.generations = 100, wait.generations = 100, fit.func = "pvals", nboots = 0, discard = "hull.control", reestimate = "FALSE") summary(m.out.4) # In order to retrieve or create the new matched data set based on the output from # the 'matchit' function, we must do some rather tedious scripting, first, selecting # the matched cases by case or row number, then creating a grouping variable to # identify each group, then renaming each data frame's columns so they will # match when we finally row-bind (rbind) them back together. m.data <- data.frame(cbind(data.df[row.names(m.out.4$match.matrix),c("c1","c2","g1","x2","x3","y")], data.df[m.out.4$match.matrix,c("c1","c2","g1","x2","x3","y")])) head(m.data) m.data.1 <- cbind(rep("1", length(m.data[,1])), m.data[,1:6]) m.data.0 <- cbind(rep("0", length(m.data[,3])), m.data[,7:12]) head(m.data.1) head(m.data.0) names(m.data.1) <- c("Group", "c1", "c2", "g1", "x2", "x3", "y") names(m.data.0) <- c("Group", "c1", "c2", "g1", "x2", "x3", "y") head(m.data.1) head(m.data.0) matched.data <- rbind(m.data.1,m.data.0) matched.data matched.data <- data.frame(matched.data) head(matched.data) nrow(matched.data) ncol(matched.data) head(data.df) nrow(data.df) ncol(data.df) # Keep in mind, this new (matched) data can be saved or written out of R using the # standard functions (e.g. write.table, write.csv, etc.). help(write.table) help(write.csv) # Now we can do some comparisons to see how the matching has affected our analysis # of the data; in terms of the proposed model we will now test. Here we are using # a simple linear model, but keep in mind the model could be a complex SEM or HLM # or whatever. # First, the original data: lm.original <- lm(y ~ g1 + x2 + x3 + c1 + c2, data = data.df) summary(lm.original) # Now with the matched data: lm.matched <- lm(y ~ g1 + x2 + x3 + c1 + c2, data = matched.data) summary(lm.matched) # It may be a bit odd that the output of each of the linear models is virtually # the same. That is because the simulated data was created in such a way as to # have extremely large effect sizes. However, if we look closely at the standard # errors and the t-values we can see that the linear model with the matched # data is more accurately capturing the main effects of each variable; because # we have decreased the strength of the relationships between the grouping # variable and the covariates # We can also document improvement by taking a look at the Variance Inflation # Factor (VIF) for the grouping variable and both covariates. Notice, the VIF # for the matched variables (g1, c1, c2) is substantially lower. library(car) vif(lm.original) vif(lm.matched) #### # Cleaning up the workspace. search() detach("package:relaimpo") detach("package:corpcor") detach("package:mitools") detach("package:survey") detach("package:boot") detach("package:car") detach("package:survival") detach("package:splines") detach("package:nnet") detach("package:MatchIt") detach("package:Matching") detach("package:MASS") detach("package:rgenoud") detach("package:WhatIf") detach("package:lpSolve") detach("package:rpart") search() ls() rm(data.df, lm.matched, lm.original, m.data, m.data.0, m.data.1, m.out.2, m.out.3, m.out.4, matched.data) ls() ################################################################################ # ##### The script below shows how the data was created. # ################################################################################ # Creating data for exploring MatchIt. library(arm) n <- 160 bc0 <- 1 c1 <- rbinom(n, 1, .5) c1 <- recode(c1, "'0' = '-1'; '1' = '1'") bc1 <- 2.5 c2 <- rnorm(n) bc2 <- 1.5 g1 <- rbinom(n, 1, invlogit(bc0 + bc1*c1 + bc2*c2)) id <- which(g1 == 1) df.1 <- data.frame(c1, c2, g1) df.1 <- df.1[id,] df.1 <- df.1[1:50,] rm(bc0,c1,bc1,c2,bc2,g1,id) bc0 <- 1 c1 <- rbinom(n, 1, .5) c1 <- recode(c1, "'0' = '-1'; '1' = '1'") bc1 <- 2.0 c2 <- rnorm(n) bc2 <- 1.0 g1 <- rbinom(n, 1, invlogit(bc0 + bc1*c1 + bc2*c2)) id <- which(g1 == 0) df.0 <- data.frame(c1, c2, g1) df.0 <- df.0[id,] df.0 <- df.0[1:50,] rm(bc0,c1,bc1,c2,bc2,g1,id) df <- rbind(df.1,df.0) rm(df.1,df.0) head(df,10) tail(df,10) nrow(df) attach(df) n <- 100 x2 <- rnorm(n) x3 <- rnorm(n) b0 <- 2 b1 <- 3.5 b2 <- -.9 b3 <- .5 bc1 <- 1.5 bc2 <- .5 y <- b0 + b1*g1 + b2*x2 + b3*x3 + bc1*c1 + bc2*c2 detach(df) data.df <- data.frame(df,x2,x3,y) summary(data.df) cor(data.df) lm.x <- lm(y ~ g1 + x2 + x3 + c1 + c2, data.df) summary(lm.x) summary(bayesglm(g1 ~ c1 + c2, data.df, family = binomial("logit"))) ls() rm(b0,b1,b2,b3,bc1,bc2,df,lm.x,n,x2,x3,y) ls() search() detach("package:arm") detach("package:lme4") detach("package:Matrix") detach("package:R2WinBUGS") detach("package:coda") detach("package:lattice") detach("package:abind") detach("package:car") detach("package:nnet") detach("package:survival") detach("package:splines") detach("package:MASS") detach("package:foreign") search() # End; Apr. 2, 2011.