# # ########## A VERY Useful Package for Working with Linear Models ########## # # The 'lmSupport' package written by John Curtin and released 15 Feb., 2011. ####################################################################### # Read in an example data file (this data is fictitious). dataset <- read.table("http://www.unt.edu/rss/class/Jon/R_SC/Module6/IntroPsych_Spring2010.txt", header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE) summary(dataset) nrow(dataset) # Attach the data so that we can reference variables by name directly instead of using the '$' operator (e.g. # dataset$age). attach(dataset) ####################################################################### # Load the 'lmSupport' library, note there are several dependent packages. library(lmSupport) ####################################################################### # The 'lm.describeData' function "provides three levels of detail regarding descriptive statistics for a # data frame" (Curtin, 2011, p. 7). subset <- data.frame(neuroticism, extroversion, openness) detach(dataset) lm.describeData(subset, detail = 1) lm.describeData(subset, detail = 2) # the default lm.describeData(subset, detail = 3) rm(subset) attach(dataset) ####################################################################### # The 'lm.describeGroups' function works the same way as the above, 'lm.describeData' function; but # 'lm.describeGroups' lists descriptive statistics for factor levels on a specified outcome variable. lm.describeGroups(number_grade, list(sex)) lm.describeGroups(number_grade, list(sex, class_standing)) ####################################################################### # The 'lm.mergeData' function allows two data frames to be combined (adding columns/variables) by specifying a # reference variable (column name or number) in each data frame. subset.1 <- data.frame(ID, age, sex) head(subset.1) subset.2 <- data.frame(ID, hardworker, number_grade) head(subset.2) superset <- lm.mergeData(subset.1, subset.2, by.x = "ID", by.y = "ID") head(superset) rm(subset.1, subset.2, superset) ####################################################################### # The 'lm.boxCox' function "calculates and plots the log-likelihoods lambda for power (Box, Cox) transfomation of # the response (outcome) variable" (Curtin, 2011, p. 2). # Box and Cox power transformations are appropriate when the outcome variable is extremely skewed. # For more information, see: Box and Cox (1964). # The function 'lm.boxCox' requires all values of the outcome variable to be greater than zero. Therefore, in this # example; we add '1' to the outcome variable (absences) to create a model on which the transformation can be # applied. hist(absences, col = "lightgreen") model.1 <- lm(absences + 1 ~ neuroticism + extroversion) lm.boxCox(model.1, lambdas = seq(-2, 2, by = 0.1)) # Using the 'Best Lambda' estimate (0.06 for this example), we can apply and illustrate the Box & Cox transformation. bC.absences <- (((absences + 1)^0.06) - 1) / 0.06 hist(bC.absences, col = "lightgreen") rm(bC.absences) ####################################################################### # The 'lm.modelAssumptions' function "provides diagnostic graphs and score tests to evaluate linear model # assumptions of normality, constant variance, and linearity" (Curtin, 2011, p. 11). lm.modelAssumptions(model.1, 'NORMAL') # Note that this one (below) returns a suggested power transformation lambda statistic. lm.modelAssumptions(model.1, 'CONSTANT') # This one (below) requires clicking on the graphics window. lm.modelAssumptions(model.1, 'LINEAR') ####################################################################### # The 'lm.caseAnalysis' function "provides diagnostic graphs and visual cut points for identification of # points that are univariate outliers, high leverage, regression outliers, and/or influential" (Curtin, 2011, p.3) # When the folling functions are used, the 'identify' ability is enabled. Meaning, you can use your mouse to click # on the blue points located to the right of the vertical red line, along the x-axis; this allows you to identify # cases which create extreme outliers. Activate (click on) the R console and then hit the 'Esc' key to exit the # identify ability. lm.caseAnalysis(model.1, 'RESIDUALS') lm.caseAnalysis(model.1, 'UNIVARIATE') lm.caseAnalysis(model.1, 'DFBETAS') ####################################################################### # The 'lm.pointEstimates' function "provides point estimates and SEs [standard errors] from a linear model (Curtin, # 2011, p. 12). It also returns lower (lwr) and upper (upr) bounds of a 95% confidence interval for the predicted # values of the outcome (based on the supplied model). lm.pointEstimates(model.1, dataset) ####################################################################### # The 'lm.correctSE' function "calculates heteroscedasticity-corrected SEs (standard errors) and associated tests # for regression coefficients based on [the] method described by White (1980) using 'hccm' function from the 'car' # package (Curtin, 2011, p. 5). In the example below, you can tell our model (model.1) does not have much # heterscedasticity; because, the standard errors change very little when corrected using White's method. lm.correctSE(model.1) ####################################################################### # The 'lm.sumSquares' function "Calculates unique SSRs [sums of squares residual], SSE [sums of squares effect], # SST [sums of squares total]. Based on these SSs, it calculates sd2 (aka eta2 [eta squared], delta R2 [delta R- # squared]), and pr2 (aka partial eta2) for all effects in a linear model object. For categorical variables, it # calculates these for multi-df effect. Uses the 'anova' function from the 'car' package with Type 3 error" (Curtin, # 2011, p. 17). lm.sumSquares(model.1) rm(model.1) ####################################################################### # The 'lm.setContrasts' function "calculates a contrast matrix for a specified contrast type" (Curtin, 2011, p. 15). # There are four available contrast types (for later use coding factor levels as numeric variables): DUMMY, EFFECTS, # POC (Planned Orthogonal Contrasts), and HELMERT. subset <- data.frame(sex, family_income, class_standing) detach(dataset) contrasts(subset$sex) <- lm.setContrasts(subset$sex, Type = 'DUMMY', RefLevel = 1) contrasts(subset$family_income) <- lm.setContrasts(subset$family_income, Type = 'EFFECTS', RefLevel = 2) contrasts(subset$class_standing) <- lm.setContrasts(subset$class_standing, Type = 'POC', POCList = list(c(-1,-1,-1,3), c(0,-1,-1,2))) ####################################################################### # The 'lm.codeRegressors' function "adds new variables/columns in a dataframe to represent numeric regressors # for a factor" (Curtin, 2011, p. 4). Essentially, this function takes the "contrasts" information from the # previously discussed 'lm.setContrasts' function and creates the extra variables for coding levels # of a factor (e.g. dummy variables, effect variables, etc.). head(subset) subset <- lm.codeRegressors(subset, 'sex') head(subset) subset <- lm.codeRegressors(subset, 'family_income') head(subset) subset <- lm.codeRegressors(subset, 'class_standing') head(subset) rm(subset) attach(dataset) ####################################################################### # The 'lm.deltaR2' function "calculates [an] F-test to compare two models to determine if model 2 R2 [R-squared] # is significantly greater than model 1 R2 [R-squared]" (Curtin, 2011, p. 6). Note, model 1 should contain a # subset of model 2's predictors (the first model should always have fewer predictors). model.1 <- lm(number_grade ~ drinks_week + absences + IQ + confidence + hardworker, data = dataset) model.2 <- lm(number_grade ~ drinks_week + absences + IQ + confidence + hardworker + neuroticism + extroversion + openness + agreeableness + conscientiousness, data = dataset) lm.deltaR2(model.1, model.2) # Similar to the old faithful: anova(model.1, model.2) rm(model.1, model.2) ############################# References ############################## # Box, G., & Cox, D. (1964). An analysis of transformations (with discussion). Journal # of the Royal Statistical Society, 26, 211 - 252. [Available in Adobe.pdf from JSTOR]. # Curtin, J. J. (2010). Package 'lmSupport' (manual). Available at CRAN: # http://cran.r-project.org/web/packages/lmSupport/lmSupport.pdf # White, H. (1980). A heteroskedastic consistent covariance matrix estimator and a direct test of # heteroskedasticity. Econometrica, 48, 817 - 838. [Available in Adobe.pdf through JSTOR]. # # END: Feb. 2011.