#
#
############### Basic and Robust Linear Regression ###############
#
# This script assumes you have worked through all the previous notes from
# the web page and you have downloaded, installed, and updated all available
# R packages.
# Load the following libraries if you have not already; when an additional
# library is needed, it will be listed in the script and loaded.
library(foreign)
library(Rcmdr)
# Start by using the 'foreign' library to import 'Example Data 5.sav' (available
# from the web page) assigning the name 'example5'.
example5 <- read.spss("http://www.unt.edu/rss/class/Jon/R_SC/Module3/ExampleData5.sav",
use.value.labels=TRUE, max.value.labels=Inf, to.data.frame=TRUE)
attach(example5)
###### Basic Linear Regression ######
# Gather the variables of interest.
subset.1 <- data.frame(apt,prison,age,peyrs,bt)
summary(subset.1)
# "Omit" the missing values.
subset.2 <- data.frame(na.omit(subset.1))
summary(subset.2)
detach(example5)
rm(subset.1)
attach(subset.2)
nrow(subset.2)
# Standard linear regression (returns un-standardized or raw coefficients).
reg1 <- lm(apt~prison+age+peyrs+bt)
summary(reg1)
# Standardized coefficients (using QuantPsyc library & lm.beta function).
library(QuantPsyc)
lm.beta(reg1)
# Plot.
scatterplotMatrix(~apt+prison+age+peyrs+bt,
reg.line=lm, smooth=TRUE, span=0.5,
diagonal = 'density', data=subset.2)
# Influence plot.
influencePlot(reg1)
# Confidence intervals for the coefficients.
confint(reg1, level=.95)
# Get the confidence interval for the Adj. R-square; taken from the output of
# summary(reg1) above (N=number of rows/cases; p=number of predictors in the
# model).
# Load the required library.
library(MBESS)
ci.R2(R2=.1286, conf.level=.95, N=726, p=4)
### Some diagnostic plots.
oldpar <- par(oma=c(0,0,3,0), mfrow=c(2,2))
plot(reg1)
par(oldpar)
#### Mahalanobis distance.
## Robust Mahalanobis distance.
# Using library(chemometrics) and Moutlier function; note: there are several dependent packages.
library(chemometrics)
# Get the robust Mahalanobis
# distances for each case of our current data; the plot contains both robust
# and classical Mahalanobis distances.
r.md <- Moutlier(subset.2,quantile=0.975,plot=TRUE)$rd
summary(r.md)
# Q to Q plot of the robust Mahalanobis distance (used to look for outliers).
par(mfrow=c(1,1))
qqPlot(r.md, dist= "norm", line="robust")
legend("topleft", "Robust Mahalanobis Distances", pch=1, title="Quantile-Quantile Plot", col="red")
# If desired; you can get the classic Mahalanobis distances for each case
# of our current data.
c.md <- Moutlier(subset.2,quantile=0.975,plot=FALSE)$md
summary(c.md)
par(mfrow=c(1,1))
qqPlot(c.md, dist= "norm", line="robust")
legend("topleft", "Classic Mahalanobis Distances", pch=1, title="Quantile-Quantile Plot", col="red")
par(oldpar)
## Comparison of Classic & Robust Mahalanobis Distances.
scatterplot(r.md, c.md, smooth=TRUE, ellipse=TRUE, span =1,
reg.line=lm, boxplots='xy')
# Cases with extreme Mahalanobis distances (upper right) are likely to be
# true multivariate outliers.
###### Robust regression using 'Robust' library/package. ######
library(robust)
# Robust Regression.
rob.reg <- lmRob(apt~prison+age+peyrs+bt, subset.2)
summary(rob.reg)
# The 'Test for Bias' output is a diagnostic for the fitting procedure and
# is not direct summary output for the fitted model.
# Get the confidence interval for the R-square.
ci.R2(R2=.12996, conf.level=.95, N=726, p=4)
# In the console, you will have to make number selections using the number
# keys and then hit enter to get each graph;
# THEN, you will have to hit the zero key or the 'escape' key to exit the
# graphs dialogue.
plot(rob.reg)
###### Relative Importance of Predictors to the Outcome in a model. ######
# Due to issues of multicolinearity (the so-called 'bouncing beta' problem); it
# is often desirable to use the relative importance functions to gain a better
# understanding of how each predictor contributes to the outcome variable in a
# regression model (uses the 'relaimpo' library).
library(relaimpo)
# Calculate the relative importance of the 4 predictors; larger 'lmg' is
# better (meaning more important).
calc.relimp(reg1)
# Conduct the bootstrapped relative importance function (b=number of bootstrapped
# resamples).
boot.reg1 <- boot.relimp(subset.2, b = 200, type = c("lmg"),
rank = TRUE, diff = TRUE, rela = TRUE)
booteval.relimp(boot.reg1)
plot(booteval.relimp(boot.reg1))
# Conduct the bootstrapped relative importance function; this time using
# ALL possible relative importance indices (lmg, last, first, pratt).
boot.reg1.2<-boot.relimp(subset.2,b = 200,type = c("lmg","last","first", "pratt"),
rank = TRUE, diff = TRUE, rela = TRUE)
booteval.relimp(boot.reg1.2)
plot(booteval.relimp(boot.reg1.2))
###### Best Subsets Regression ######
# Another alternative (also based on relative importance) would be the Best
# Possible Subsets function (leaps library), which calculates the best
# predictor(s) for a given model with 1, 2, or 3 predictors (out of the 4).
# Best possible subsets; with NO MORE than 3 predictors (of the 4) retained;
# the asterisk in the output denotes which variable qualifies as best in a
# given model with 1 predictor, 2 predictors, or 3 predictors.
library(leaps)
reg2 <- regsubsets(apt~prison+age+peyrs+bt, subset.2, intercept=T, nvmax=3)
summary(reg2)
# This time; isolating the best 2 predictors out of the 4 available.
reg3 <- regsubsets(apt~prison+age+peyrs+bt, subset.2, intercept=T, nvmax=2)
summary(reg3)
###### Direct comparison of two fitted models. ######
# Comparison of two models in terms of R-square change; standard regression.
# Sometimes called sequential or hierarchical regression; often used to
# evaluate co-variates or different predictors (not to be confused with
# step-wise regression).
RegMod.1 <- lm(apt~prison+age+peyrs)
summary(RegMod.1)
RegMod.2 <- lm(apt~prison+age+peyrs+bt)
summary(RegMod.2)
anova(RegMod.1,RegMod.2)
# The anova is testing for R-square change from model 1 to model 2 (R-square
# change different from zero?). Here; the F-value is relatively small (and the
# p-value is larger than .05) so we could safely say that R-square change is
# not significantly different from zero.
# However; we can calculate more useful (and less biased) statistics for
# comparing fitted models. In both cases (AIC & BIC) smaller values indicate
# better fit.
# The Akaike Information Criterion (AIC).
AIC(RegMod.1)
AIC(RegMod.2)
# The Bayes Information Criterion (BIC)
AIC(RegMod.1, k = log(nrow(subset.2)))
AIC(RegMod.2, k = log(nrow(subset.2)))
## See also: http://www.unt.edu/benchmarks/archives/2008/june08/rss.htm
# End: December 2010.