# # ##### More on Regression. # # Demonstrating the importance (and accuracy) of RANDOM sampling, CORRECT model specification, # and MINIMIZING measurement error. # # First, create the population (N = 100000) # NOTE: variable 'x4' is not included in 'y' and # 'p.code' is simply an identifier for each case). n <- 100000 p.code <- seq(1:n) x1 <- round(rnorm(n, 35, 5),2) x2 <- round(rnorm(n, 50, 7),2) x3 <- round(rnorm(n, 25, 5),2) x4 <- round(rnorm(n, 50, 10),2) y <- 5.5 + 2.5*x1 + -1.2*x2 + .2*x3 pop.df <- data.frame(p.code,y,x1,x2,x3,x4) rm(n,p.code,y,x1,x2,x3,x4) summary(pop.df) # Notice there is virtually no multicollinearity. cor(pop.df) # Verify the Population Model (pop.mod). pop.model <- lm(y ~ x1 + x2 + x3, data = pop.df) summary(pop.model) # Now, draw a RANDOM sample of cases (n = 50) from the population; selecting # cases by their population code number (p.code). pop.code <- sort(sample(pop.df$p.code, 50, replace = FALSE)) samp.df.1 <- pop.df[pop.code,] s.code <- seq(1:50) samp.1 <- cbind(s.code,samp.df.1) rm(pop.code,s.code,samp.df.1) summary(samp.1) cor(samp.1) # Standard regression: stunningly accurate for only 50 of 100000 cases; BUT, this # is with random sampling, perfect measurement (i.e. no error), perfect model # specification, and very little multicollinearity. model.1 <- lm(y ~ x1 + x2 + x3, data = samp.1) summary(model.1) ### Model specification errors. # Notice that having a meaningless variable (x4) in the model does not make # much of a difference. model.1a <- lm(y ~ x1 + x2 + x3 + x4, data = samp.1) summary(model.1a) # However, leaving out one variable (especially an important one like 'x1' here), # can have a substantial impact on model fit (e.g. R-squared). model.2 <- lm(y ~ x2 + x3 + x4, data = samp.1) summary(model.2) model.3 <- lm(y ~ x3 + x4, data = samp.1) summary(model.3) model.4 <- lm(y ~ x4, data = samp.1) summary(model.4) model.5 <- lm(y ~ x1, data = samp.1) summary(model.5) ### Measurement error. # This time, when we draw our sample from the population, # each variable is measured with a little error (rnorm). pop.code <- sort(sample(pop.df$p.code, 50, replace = FALSE)) samp.df.2 <- pop.df[pop.code,] s.code <- seq(1:50) samp.2 <- cbind(s.code,samp.df.2) samp.2["y"] <- samp.2["y"] + rnorm(50) samp.2["x1"] <- samp.2["x1"] + rnorm(50) samp.2["x2"] <- samp.2["x2"] + rnorm(50) samp.2["x3"] <- samp.2["x3"] + rnorm(50) rm(pop.code,s.code,samp.df.2) summary(samp.2) # Note: there may be some multicollinearity here (x1 & x4; x2 & x3; x2 & x4). cor(samp.2) # The 'small' amount of measurement error (and slight multicollinearity) does not # substantially change our model's fit. model.5 <- lm(y ~ x1 + x2 + x3 + x4, data = samp.2) summary(model.5) model.6 <- lm(y ~ x1 + x2 + x3, data = samp.2) summary(model.6) # However, with slightly greater measurement error (perhaps more realistic in the # social sciences), we see a noticable decrease in model fit. pop.code <- sort(sample(pop.df$p.code, 50, replace = FALSE)) samp.df.3 <- pop.df[pop.code,] s.code <- seq(1:50) samp.3 <- cbind(s.code,samp.df.3) samp.3["y"] <- samp.3["y"] + rnorm(50, 0, 2) samp.3["x1"] <- samp.3["x1"] + rnorm(50, 0, 2) samp.3["x2"] <- samp.3["x2"] + rnorm(50, 0, 2) samp.3["x3"] <- samp.3["x3"] + rnorm(50, 0, 2) rm(pop.code,s.code,samp.df.3) summary(samp.3) # Here, multicollinearity is not bad. cor(samp.3) # Still the fit (R-square & Adjusted R-square) is pretty good. model.7 <- lm(y ~ x1 + x2 + x3 + x4, data = samp.3) summary(model.7) model.8 <- lm(y ~ x1 + x2 + x3, data = samp.3) summary(model.8) # Adding even more measurement error further degrades the fit. pop.code <- sort(sample(pop.df$p.code, 50, replace = FALSE)) samp.df.4 <- pop.df[pop.code,] s.code <- seq(1:50) samp.4 <- cbind(s.code,samp.df.4) samp.4["y"] <- samp.4["y"] + rnorm(50, 0, 4) samp.4["x1"] <- samp.4["x1"] + rnorm(50, 0, 4) samp.4["x2"] <- samp.4["x2"] + rnorm(50, 0, 4) samp.4["x3"] <- samp.4["x3"] + rnorm(50, 0, 4) rm(pop.code,s.code,samp.df.4) summary(samp.4) # Little multicollinearity. cor(samp.4) # Fit further reduced; but, still not terrible. model.9 <- lm(y ~ x1 + x2 + x3 + x4, data = samp.4) summary(model.9) model.10 <- lm(y ~ x1 + x2 + x3, data = samp.4) summary(model.10) ##### # Now suppose we wanted to see how well our model did across a validation # trial. # We draw a random sample (n = 100) from the population and split it in half; # one half (samp.a) is used to fit the model and the other half, sometimes called the # 'holdout sample' (samp.p) will be used to evaluate or validate the predictions from # the first half. pop.code <- sort(sample(pop.df$p.code, 100, replace = FALSE)) samp.df.z <- pop.df[pop.code,] s.code <- seq(1:50) samp.z <- cbind(s.code,samp.df.z) samp.a <- samp.z[1:50,] samp.b <- samp.z[51:100,] rm(pop.code,s.code,samp.df.z,samp.z) summary(samp.a) model.11 <- lm(y ~ x1 + x2 + x3, data = samp.a) summary(model.11) # What we need to do is see how well the coefficients from the model we # fit with the first sample (samp.a) can predict values of 'y' in the # second sample (samp.b). # First, call the coefficients from 'model.11' (generated by samp.a) and # assign them a (slightly) shorter name. m11.coef <- model.11$coefficients # Then, create the variables which contains the predicted values of 'y' # using the data from samp.b and the coefficients generated by samp.a. # This will essentially be a linear model specification. pred.b <- m11.coef[1] + m11.coef[2]*samp.b$x1 + m11.coef[3]*samp.b$x2 + m11.coef[4]*samp.b$x3 rm(m11.coef) # Now, how well do these compare to the actual values of 'y' in 'samp.b'? t.test(pred.b, samp.b$y) # How well are 'pred.b' and 'samp.b$y' related? cor(pred.b, samp.b$y) # What are the residuals (hopefully small)? The average of these residuals can # be considered a 'shrinkage factor'. pred.b - samp.b$y summary(pred.b - samp.b$y) hist(pred.b - samp.b$y) shrink.b <- mean(pred.b - samp.b$y) shrink.b # Of course, we could go in the opposite direction as well, using 'samp.b' # to create a model (coefficents) to then predict the values of 'y' in # 'samp.a'. model.12 <- lm(y ~ x1 + x2 + x3, data = samp.b) summary(model.12) m12.coef <- model.12$coefficients pred.a <- m12.coef[1] + m12.coef[2]*samp.a$x1 + m12.coef[3]*samp.a$x2 + m12.coef[4]*samp.a$x3 rm(m12.coef) t.test(pred.a, samp.a$y) cor(pred.a, samp.a$y) pred.a - samp.a$y summary(pred.a - samp.a$y) hist(pred.a - samp.a$y) shrink.a <- mean(pred.a - samp.a$y) shrink.a ### Bootstrapping. # Here using the a sample ('samp.2') which has a little measurement error. # First, create some empty vectors; which will store parameter values # returned from each re-sample. boot.intercept <- as.vector(0) boot.x1.coef <- as.vector(0) boot.x2.coef <- as.vector(0) boot.x3.coef <- as.vector(0) boot.rsq <- as.vector(0) # Then, run the function. for(i in 1:5000){ b.code <- sort(sample(samp.2$s.code, 100, replace = TRUE)) boot.df <- samp.2[b.code,] mod <- lm(y ~ x1 + x2 + x3, boot.df) boot.intercept[i] <- mod$coefficients[[1]] boot.x1.coef[i] <- mod$coefficients[[2]] boot.x2.coef[i] <- mod$coefficients[[3]] boot.x3.coef[i] <- mod$coefficients[[4]] boot.rsq[i] <- summary(mod)$r.square rm(b.code, boot.df, mod) } # Take a look at the Empirical Distribution(s) of the coefficients. quantile(boot.intercept, probs = c(.025, .5, .975)) quantile(boot.x1.coef, probs = c(.025, .5, .975)) quantile(boot.x2.coef, probs = c(.025, .5, .975)) quantile(boot.x3.coef, probs = c(.025, .5, .975)) quantile(boot.rsq, probs = c(.025, .5, .975)) plot(density(boot.intercept)) plot(density(boot.x1.coef)) plot(density(boot.x2.coef)) plot(density(boot.x3.coef)) hist(boot.rsq, col = "lightblue1", prob = T) lines(density(boot.rsq), col = "blue") # For a more thorough discussion (and demonstration in R) of Regression # assumptions and evaluating/testing them, see: # http://www.unt.edu/benchmarks/archives/2008/june08/rss.htm # End: Jan. 2011.