#
######### TOO MANY PREDICTORS = BAD ############
#
# If you include a large number of predictors in a Linear (regression) Model (lm), there is a very good chance
# at least one of the predictors will be significant even when all predictors are in fact random (i.e. not
# related to the outcome). This is why variable reduction techniques (e.g. Principal Components Analysis)
# and variable selection techniques (e.g. Bayesian Model Averaging) are so important.
# The script below has 10 iterations; each iteration creates a data frame with
# one outcome variable and 40 predictor variables. Notice all 41 variables
# are created with the 'rnorm' function -- which creates n = 100 random
# normal (normally distributed) scores with a mean of 10 and a standard
# deviation of 1.5 -- which means, all 41 variables are RANDOM and not at all
# related to one another. Then, for each iteration/data frame, a linear (OLS)
# regression model is fit and the summary of each model is saved into the
# 'results' list object. Once the script is done running, you can review
# the results of each regression by looking at each element of the 'results'
# list object. Notice how many of the regression results contain at least
# one significant predictor...by chance.
# Simulation parameters: observations per data set, number of predictors,
# and number of simulated data sets.
n_obs <- 100
n_pred <- 40
n_sims <- 10

# Preallocate the results container. (The original `as.list(0)` creates a
# one-element list holding the number 0 -- not an empty list of length 10.)
results <- vector("list", n_sims)

for (i in seq_len(n_sims)) {
  # Build one data frame of 41 mutually independent N(10, 1.5^2) columns in
  # a single vectorized call instead of growing it column-by-column. Column
  # names ("X0" outcome, "V2" ... "V41" predictors) are kept identical to
  # the original script so the fitted coefficient names match.
  df.1 <- as.data.frame(
    matrix(rnorm(n_obs * (n_pred + 1), mean = 10, sd = 1.5), nrow = n_obs)
  )
  names(df.1) <- c("X0", paste0("V", seq_len(n_pred) + 1))

  # "X0 ~ ." regresses the outcome on every other column, replacing the
  # hand-written 40-term formula.
  lm.1 <- lm(X0 ~ ., data = df.1)
  results[[i]] <- summary(lm.1)
}

# Inspect each element: by chance alone, many of the fits will show at
# least one "significant" predictor even though every column is noise.
results
# End: Mar. 2012