# # #### Illustration of sampling bias and model specification error (error of omission). # # Load the necessary library. library(QuantPsyc) # First, create the population model of attributes' TRUE scores (N = 50000). N <- 50000 X1 <- rnorm(N, 0, 1) X2 <- rnorm(N, 0, 1) X3 <- rnorm(N, 0, 1) X4 <- rnorm(N, 0, 1) Y <- 0.9*X1 + 0.6*X2 + 0.3*X3 + .8*X4 # Modeling the outcome as a function of the predictors. ID <- seq(1:N) # Simply a case identification vector. # Combine to create the population data frame. population.df <- data.frame(ID,Y,X1,X2,X3,X4) rm(N,ID,Y,X1,X2,X3,X4) # Verify the population model. pop.model <- lm(Y ~ 0 + X1 + X2 + X3 + X4, population.df) summary(pop.model) # Next, randomly sample (n = 50) individuals from the populaton. samp.id <- sample(population.df[,1], 50, replace = FALSE) # Next, select those individuals' data from the population to create a sample data frame. # Note, here we leave out the unnecessary "ID" variable and, to create model # specification error, we leave out the "X4" variable. sample.df <- population.df[samp.id,2:5] head(sample.df) # Next, re-name the sample variables to lower case to indicate they are sample variables. names(sample.df) <- c("y","x1","x2","x3") # Next, (imperfectly) measure the sample individuals' attributes (here done with re-scaling). # Any Mean (here = 100) and Standard Devaition (here = 15) can be used to reflect # the mean and standard deviation of the measures you are going to use in your study. sample.df$y <- (sample.df$y - mean(sample.df$y))/sd(sample.df$y) sample.df$y <- (sample.df$y *15) + 100 sample.df$x1 <- (sample.df$x1*15) + 100 sample.df$x2 <- (sample.df$x2*15) + 100 sample.df$x3 <- (sample.df$x3*15) + 100 summary(sample.df) # Next, investigate the sample model. Note the bias or difference between it and the # population model (pop.model) from above. samp.model <- lm(y ~ x1 + x2 + x3, sample.df) summary(samp.model) lm.beta(samp.model) ls() rm(sample.df,samp.id,samp.model) ls() ################################################################################ # To get an idea of how sample estimates vary; do some REsampling. samp.results <- data.frame(matrix(0, ncol = 8)) names(samp.results) <- c("Intercept","x1.coef","x2.coef","x3.coef","R^2","x1.beta","x2.beta","x3.beta") for(i in 1:100){ samp.id <- sample(population.df[,1], 50, replace = FALSE) sample.df <- population.df[samp.id,2:5] names(sample.df) <- c("y","x1","x2","x3") sample.df$y <- (sample.df$y - mean(sample.df$y))/sd(sample.df$y) sample.df$y <- (sample.df$y *15) + 100 sample.df$x1 <- (sample.df$x1*15) + 100 sample.df$x2 <- (sample.df$x2*15) + 100 sample.df$x3 <- (sample.df$x3*15) + 100 samp.model <- lm(y ~ x1 + x2 + x3, sample.df) samp.results[i,1:4] <- samp.model$coef samp.results[i,5] <- summary(samp.model)$r.squared samp.results[i,6:8] <- lm.beta(samp.model) rm(sample.df,samp.id,samp.model) } rm(i) summary(samp.results) ls() detach("package:QuantPsyc") rm(samp.results) ls() # End.