###### Multiple Regression Simulation
#
# Demonstrates bias in R-squared and the use of bootstrap validation to
# adjust R-squared. Also demonstrates recalculating the p-value for the
# adjusted R-squared.

library(MASS)
library(rms)   # successor to the defunct 'Design' package (ols, validate, calibrate)
library(Hmisc)

rho.pop <- 0.20    # population correlation between each predictor and the outcome
sample.size <- 50  # number of simulated observations
n.pred <- 10       # number of predictor variables

# Population correlation matrix for 1 outcome (row/column 1) and n.pred
# predictors: the predictors are mutually uncorrelated (off-diagonals 0),
# and each predictor correlates rho.pop with the outcome. Building it from
# diag() avoids hand-typing a 121-element literal (and the byrow=T footgun).
Sigma <- diag(n.pred + 1)
Sigma[1, -1] <- rho.pop
Sigma[-1, 1] <- rho.pop

# Examine the POPULATION correlation matrix
Sigma

# Next step: generate simulated data from this population correlation
# matrix with means of 0 and standard deviations of 1.
# Generate simulated data from the population correlation matrix
# (means 0, standard deviations 1), then convert the matrix to a
# data.frame called "data.r".
data.r <- data.frame(mvrnorm(sample.size, mu = rep(0, 11),
                             Sigma, empirical = FALSE))

# Examine the first few cases
head(data.r)

# Sample Pearson correlations for all pairs of variables
cor(data.r)

# Fit a classical regression model with X1 as the outcome and X2-X11 as
# the predictors, using the data= argument (instead of attach(), which
# pollutes the search path). scale() converts raw scores to z-scores
# before the analysis. x=TRUE, y=TRUE retain the design matrix and
# response, which validate()/calibrate() require later.
data.r.fit <- ols(scale(X1) ~ scale(X2) + scale(X3) + scale(X4) + scale(X5) +
                    scale(X6) + scale(X7) + scale(X8) + scale(X9) +
                    scale(X10) + scale(X11),
                  data = data.r, x = TRUE, y = TRUE)
data.r.fit
anova(data.r.fit)

# Look at the names of the parts of the output and extract R-squared
names(data.r.fit)
data.r.fit$stats
data.r.fit$stats[4]

# Use bootstrap validation to estimate how optimistic the original observed
# R-squared (index.orig) is:
#   1) Resample from the original data, fit a bootstrap regression model,
#      and record its R-squared - repeat and average over B bootstrap
#      samples ("training").
#   2) For each bootstrap sample, apply the unchanged bootstrap model to
#      the original data set - repeat and average over B samples ("test").
#   3) Subtract the average 'test' R-squared from the average 'training'
#      R-squared to get the average OPTIMISM in the original R-squared.
# The resulting 'index.corrected' R-squared represents the expected
# prediction accuracy after accounting for the optimism in the original
# observed R-squared.
# Bootstrap validation of the fitted model (B = 1000 resamples)
data.r.validate <- validate(data.r.fit, B = 1000)
data.r.validate

# Calibrate the final model and plot the calibration
data.r.calibrate <- calibrate(data.r.fit)
data.r.calibrate
plot(data.r.calibrate)

##############################################################################
#
# This section demonstrates Cohen's effect sizes for regression and ANOVA
# and how to calculate a new probability value for the bootstrap
# bias-adjusted R-squared value.
#
# Note that F(df.num, df.denom) = cohen.f.squared * (N - 1 - num.pred) / num.pred

# Degrees of freedom derived from the design rather than hard-coded:
# numerator df = number of predictors, denominator df = N - 1 - predictors.
df.num <- 10                           # 10 predictors (X2..X11)
df.denom <- sample.size - 1 - df.num   # 50 - 1 - 10 = 39

# Substitute the 'index.corrected' R-squared from the validate() output here.
r.squared.boot.adj <- .25
r.squared.boot.adj

# Cohen's f-squared from the bias-adjusted R-squared
f.squared <- r.squared.boot.adj / (1 - r.squared.boot.adj)
f.squared

# Convert the effect size to an F ratio for the bias-adjusted model
F.value.adj <- f.squared * (df.denom / df.num)
F.value.adj

# Bias-adjusted p-value: the probability of an F at least this large
# under H0, i.e. the UPPER tail. (lower.tail = TRUE would give
# P(F <= f), which is NOT a p-value.)
pf(F.value.adj, df.num, df.denom, ncp = 0, lower.tail = FALSE, log.p = FALSE)