# # ####### Yet another cautionary script for linear regression modelers. ####### # df1 <- read.table("http://www.unt.edu/rss/class/Jon/R_SC/Module6/caution.lm.data.001.txt", header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE) summary(df1) hist(df1$y, main = "Outcome (y)", xlab = "y", ylab = "Histogram", col = "lightblue") plot(density(df1$y), main = "Outcome (y)", xlab = "y", ylab = "Density", col = "darkgreen") # Is the following a 'good' regression model? lm.1 <- lm(y ~ x1 + x2 + x3 + x4, df1) par(mfrow=c(2,2)) plot(lm.1) summary(lm.1) # Are the assumptions met (they appear to be, based on the tests below)? library(gvlma) global.test <- gvlma(lm.1) summary(global.test) detach("package:gvlma") rm(global.test) # Testing for nonlinearity... small (significant) p-values indicate nonlinearity. library(lmtest) resettest(y ~ x1 + x2 + x3 + x4, power = 2:3, type = "regressor", data = df1) detach("package:lmtest") detach("package:zoo") # Are you sure this is a 'good' model?... library(car) par(mfrow=c(1,1)) scatterplotMatrix(df1) cor(df1) detach("package:car") detach("package:MASS") detach("package:nnet") detach("package:survival") detach("package:splines") ################################################################################ # The script below shows how the data was made. Z <- function(x){(1/sqrt(2*pi))*(exp(x^2/-2))} n <- 1000 x1 <- seq(-3.5, 3.5, length = n) x2 <- 10*Z(x1) x3 <- (x2 - mean(x2))/sd(x2) x3 <- ((x2 * 1.5) + 10) + rnorm(n) x4 <- rnorm(n, 10, 1.5) e <- rnorm(n, 0, 1.5) y <- 1.5 + .8*x1 + .6*x2 + .4*x3 + e x1 <- x1 + 10 x2 <- x2 + 8 x3 <- x3 - 3 y <- y + 4 df2 <- data.frame(y,x1,x2,x3,x4) rm(n, Z, x1, x2, x3, x4, e, y) summary(df2) rm(df1,df2,lm.1) # End; Sep. 23, 2011.