#
#
###### Using the 'MatchIt' package to remove the influence of covariates on a causal model.
#
# 'MatchIt' has several methods which can be used to 'balance' the effects of one or
# more covariates (i.e. confounds) across the two levels of a treatment variable (e.g.
# dichotomous 'grouping variable' predictor).
# First, read in the example data from the web naming it "data.df", get a summary, and
# take note of the number of rows (nrow). This data is simulated and was created specifically
# as an example for discussing matching in a regression situation. In the summary output
# notice that all variables are numeric; although the dichotomous grouping variable (0 = control
# & 1 = treatment) is g1. The covariates (confounder variables) are c1 and c2; with c1 being
# dichotomous and c2 being continuous. The continuous outcome variable is y1. Both x2 and x3
# are continuous predictors of y1 along with the grouping variable (g1); but x2 and x3 are
# not related to g1, c1, or c2.
data.df <- read.table("http://www.unt.edu/rss/class/Jon/Benchmarks/Data4MatchItExample.txt",
header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE)
summary(data.df)
nrow(data.df)
# Load the 'MatchIt' package.
search()
library(MatchIt)
search()
# Next, we run our first 'matchit' with all the default values for each argument specified
# and get a summary of the results.
m.out.1 <- matchit(formula = g1 ~ c1 + c2 + x2 + x3, data = data.df, method = "nearest",
distance = "logit", discard = "none", reestimate = "FALSE")
summary(m.out.1)
# There are four pieces of output produced by the summary function. The summary of balance
# for "all data" (i.e. the original data), the summary of balance for the "matched data",
# the percentages of balance improvement, and the sample size summaries.
# The output of the summary on 'm.out.1' reveals rather strikingly perfect matching. The
# key elements to focus on are the 'Mean Diff' for the "all data" compared to the 'Mean
# Diff' for the "matched data" -- notice there are no differences. This is confirmed by
# noting the Percent Balance Improvement where the zero values indicate
# a zero percentage change.
# Furthermore, notice that all the control cases were retained and all the treated cases.
# Essentially, nothing has been done; because, each control case was matched to each treated
# case on the distance measure; no selection has taken place based on the distance measure.
# One could use the 'discard' optional argument to specify a distance criterion; or specify
# a region of common support.
# This first example also points out why one should not simply take for granted the
# default values for arguments of a function.
rm(m.out.1)
# As a comparison; and to show the reason one would want to use the 'matchit' function,
# we run an example using the "genetic" (algorithm) method, "rpart" distance, and
# discard "hull.control" which retains all the treatment cases.
m.out.2 <- matchit(formula = g1 ~ c1 + c2 + x2 + x3, data = data.df, method = "genetic",
distance = "rpart", discard = "hull.control", reestimate = "FALSE")
summary(m.out.2)
# In this summary, we notice that although the mean differences were drastically reduced
# for the two covariates (c1 & c2), the mean difference actually increased
# for one of the two predictors (x2). This is a result of those two
# predictors NOT being related to the grouping variable. So, we might run a third
# version of the 'matchit' function; including only the two covariates.
m.out.3 <- matchit(formula = g1 ~ c1 + c2, data = data.df, method = "genetic",
distance = "rpart", discard = "hull.control", reestimate =
"FALSE")
summary(m.out.3)
# With this summary (m.out.3) we see large reductions in the mean differences and
# the corresponding percent balance improvements.
# We can also adjust the 'GenMatch' function which is called by the method = "genetic"
# to better take advantage of the 'GenMatch'; possibly improving the results, but also
# possibly reducing them. Below, the defaults are shown -- which produce the same
# output as the previous run (m.out.3).
m.out.4 <- matchit(formula = g1 ~ c1 + c2, data = data.df, method = "genetic",
distance = "rpart",
pop.size = 15, max.generations = 100, wait.generations = 100,
fit.func = "pvals", nboots = 0,
discard = "hull.control", reestimate = "FALSE")
summary(m.out.4)
# In order to retrieve or create the new matched data set based on the output from
# the 'matchit' function, we must do some rather tedious scripting, first, selecting
# the matched cases by case or row number, then creating a grouping variable to
# identify each group, then renaming each data frame's columns so they will
# match when we finally row-bind (rbind) them back together.
m.data <- data.frame(cbind(data.df[row.names(m.out.4$match.matrix),c("c1","c2","g1","x2","x3","y")],
data.df[m.out.4$match.matrix,c("c1","c2","g1","x2","x3","y")]))
head(m.data)
m.data.1 <- cbind(rep("1", length(m.data[,1])), m.data[,1:6])
m.data.0 <- cbind(rep("0", length(m.data[,3])), m.data[,7:12])
head(m.data.1)
head(m.data.0)
names(m.data.1) <- c("Group", "c1", "c2", "g1", "x2", "x3", "y")
names(m.data.0) <- c("Group", "c1", "c2", "g1", "x2", "x3", "y")
head(m.data.1)
head(m.data.0)
matched.data <- rbind(m.data.1,m.data.0)
matched.data
matched.data <- data.frame(matched.data)
head(matched.data)
nrow(matched.data)
ncol(matched.data)
head(data.df)
nrow(data.df)
ncol(data.df)
# Keep in mind, this new (matched) data can be saved or written out of R using the
# standard functions (e.g. write.table, write.csv, etc.).
help(write.table)
help(write.csv)
# Now we can do some comparisons to see how the matching has affected our analysis
# of the data; in terms of the proposed model we will now test. Here we are using
# a simple linear model, but keep in mind the model could be a complex SEM or HLM
# or whatever.
# First, the original data:
lm.original <- lm(y ~ g1 + x2 + x3 + c1 + c2, data = data.df)
summary(lm.original)
# Now with the matched data:
lm.matched <- lm(y ~ g1 + x2 + x3 + c1 + c2, data = matched.data)
summary(lm.matched)
# It may be a bit odd that the output of each of the linear models is virtually
# the same. That is because the simulated data was created in such a way as to
# have extremely large effect sizes. However, if we look closely at the standard
# errors and the t-values we can see that the linear model with the matched
# data is more accurately capturing the main effects of each variable; because
# we have decreased the strength of the relationships between the grouping
# variable and the covariates
# We can also document improvement by taking a look at the Variance Inflation
# Factor (VIF) for the grouping variable and both covariates. Notice, the VIF
# for the matched variables (g1, c1, c2) is substantially lower.
library(car)
vif(lm.original)
vif(lm.matched)
####
# Cleaning up the workspace.
search()
detach("package:relaimpo")
detach("package:corpcor")
detach("package:mitools")
detach("package:survey")
detach("package:boot")
detach("package:car")
detach("package:survival")
detach("package:splines")
detach("package:nnet")
detach("package:MatchIt")
detach("package:Matching")
detach("package:MASS")
detach("package:rgenoud")
detach("package:WhatIf")
detach("package:lpSolve")
detach("package:rpart")
search()
ls()
rm(data.df, lm.matched, lm.original, m.data, m.data.0, m.data.1, m.out.2, m.out.3,
m.out.4, matched.data)
ls()
################################################################################
#
##### The script below shows how the data was created.
#
################################################################################
# Creating data for exploring MatchIt.
library(arm)
n <- 160
bc0 <- 1
c1 <- rbinom(n, 1, .5)
c1 <- recode(c1, "'0' = '-1'; '1' = '1'")
bc1 <- 2.5
c2 <- rnorm(n)
bc2 <- 1.5
g1 <- rbinom(n, 1, invlogit(bc0 + bc1*c1 + bc2*c2))
id <- which(g1 == 1)
df.1 <- data.frame(c1, c2, g1)
df.1 <- df.1[id,]
df.1 <- df.1[1:50,]
rm(bc0,c1,bc1,c2,bc2,g1,id)
bc0 <- 1
c1 <- rbinom(n, 1, .5)
c1 <- recode(c1, "'0' = '-1'; '1' = '1'")
bc1 <- 2.0
c2 <- rnorm(n)
bc2 <- 1.0
g1 <- rbinom(n, 1, invlogit(bc0 + bc1*c1 + bc2*c2))
id <- which(g1 == 0)
df.0 <- data.frame(c1, c2, g1)
df.0 <- df.0[id,]
df.0 <- df.0[1:50,]
rm(bc0,c1,bc1,c2,bc2,g1,id)
df <- rbind(df.1,df.0)
rm(df.1,df.0)
head(df,10)
tail(df,10)
nrow(df)
attach(df)
n <- 100
x2 <- rnorm(n)
x3 <- rnorm(n)
b0 <- 2
b1 <- 3.5
b2 <- -.9
b3 <- .5
bc1 <- 1.5
bc2 <- .5
y <- b0 + b1*g1 + b2*x2 + b3*x3 + bc1*c1 + bc2*c2
detach(df)
data.df <- data.frame(df,x2,x3,y)
summary(data.df)
cor(data.df)
lm.x <- lm(y ~ g1 + x2 + x3 + c1 + c2, data.df)
summary(lm.x)
summary(bayesglm(g1 ~ c1 + c2, data.df, family = binomial("logit")))
ls()
rm(b0,b1,b2,b3,bc1,bc2,df,lm.x,n,x2,x3,y)
ls()
search()
detach("package:arm")
detach("package:lme4")
detach("package:Matrix")
detach("package:R2WinBUGS")
detach("package:coda")
detach("package:lattice")
detach("package:abind")
detach("package:car")
detach("package:nnet")
detach("package:survival")
detach("package:splines")
detach("package:MASS")
detach("package:foreign")
search()
# End; Apr. 2, 2011.