# # ############### Basic and Robust Correlation ############### # # This script assumes you have worked through all the previous notes from # the web page and you have downloaded, installed, and updated all available # R packages. # Load the following libraries if you have not already; when an additional # library is needed, it will be listed in the script and loaded. library(foreign) # Start by using the 'foreign' library to import 'Example Data 5.sav' (available # from the web page) assigning the name 'example5'. example5 <- read.spss("http://www.unt.edu/rss/class/Jon/R_SC/Module3/ExampleData5.sav", use.value.labels=TRUE, max.value.labels=Inf, to.data.frame=TRUE) attach(example5) nrow(example5) names(example5) ############### Two Variables ################ plot(prison,apt) # Assign those two variables to a new data frame, and then remove the # missing values. apt.prison <- data.frame(apt,prison) a.p <- na.omit(apt.prison) detach(example5) attach(a.p) rm(apt.prison) # Quantiles of apt with respect to quantiles of prison. qqplot(prison,apt) ###### Different types of correlations (Two Variables). ###### pear.cor <- cor(prison, apt, use = "pairwise", method = "pearson") pear.cor ken.cor <- cor(prison, apt, use = "pairwise", method = "kendall") ken.cor spear.cor <- cor(prison, apt, use = "pairwise", method = "spearman") spear.cor # Simple correlation significance test (default method = Pearson). cor.test(apt,prison) # OR specify method: cor.test(apt,prison, method="spearman") cor.test(apt,prison, method="kendall") # Robust Percentage Bend (pb) correlation; available in the Wilcox Robust Statistics (WRS) library. # Keep in mind, the WRS library/package is not available on CRAN, you must get it from # R-Forge at the following location: https://r-forge.r-project.org/R/?group_id=468 # To install this package directly within R type: install.packages("WRS", repos="http://R-Forge.R-project.org") library(WRS) pbcor(apt,prison) # Take a look with relplot. relplot(apt,prison) ## Robust correlations Minimum Volume Ellipsoid (MVE) & ## Minimum Covariance Determinant (MCD). ## The 'cov.rob' function is located in the ## MASS library (required by the Rcmdr library, above). # Run the 'cov.rob' function, specifying the method as MVE. # Note, the 'set.seed' function simply ensures we get consistent results. set.seed(123) cov.rob(a.p, method="mve") # to Extract just the correlation matrix: cov.rob(a.p, cor=TRUE, method="mve")$cor # Next, run the 'cov.rob' function, specifying the method as MCD. cov.rob(a.p, cor=TRUE, method="mcd")$cor # More useful scatterplot with linear least squares regression line # and smoothers, as well as x and y axis boxplots. library(car) scatterplot(apt, prison, smooth=TRUE, ellipse=TRUE, span =.5, reg.line=lm, boxplots='xy', data=a.p) ############################################################################# # Some Clarification of Robust correlations; Minimum Volume Ellipsoid (MVE) # # and Minimum Covariance Determinant (MCD) Correlations. # # # # “These measures search for the central half of the data and then use this # # half of the data to estimate location and scatter. As is evident, the # # covariance associated with this central half of the data readily yields a # # correlation coefficient. For example, simply compute Pearson’s # # correlation based on the central half of the data” (Wilcox, 2005, p. 404).# # # # Clarification of Percentage Bend Correlations (pbcor). # # “Rather, the goal is to estimate a measure of correlation, rpb, that is # # not overly sensitive to slight changes in the distributions” (Wilcox, # # 2005, p. 391). # # # # Essentially, the percentage bend is trimming each variable of its # # univariate outliers, then conducting the correlation, while the MVE is # # eliminating bivariate outliers; using only the ‘tightest’ 50% of the data # # to conduct the correlation. # # # # Wilcox, R. (2005). Introduction to Robust Estimation and Hypothesis # # Testing. San Diego: Academic Press, Second edition. # ############################################################################# detach(a.p) rm(a.p) ###### Different types of correlations (Matrix of variables). ####### # Simple scatter plot matrix. pairs(~apt+prison+age+peyrs+bt, example5) # Gather the variables of interest. attach(example5) model.1 <- data.frame(apt,prison,age,peyrs,bt) head(model.1) detach(example5) # Simple scatter plot matrix (using the entire data.frame). pairs(model.1) # Correlation matrix. cor(model.1) # *Notice how it returns 'NA' for each correlation because, we have missing # data on each variable. # "Omit" the missing values. model.2 <- data.frame(na.omit(model.1)) model.2 attach(model.2) rm(model.1) # Default correlations are Pearson method. cor(model.2) #correlation matrix abs(cor(model.2)) #absolute value correlation matrix (all positive) round(cor(model.2), 3) #rounding off to the third digit # Other methods of correlation: Spearman, Kendall. cor(model.2, method="spearman") cor(model.2, method="kendall") # Covariance cov(model.2) ## Complex Scatter matrix w/correlations in the upper panel where the font ## size of each correlation corresponds to the magnitude of the correlation. # **Note: all you would need to change to apply this to a new data frame is # to change the name of the data frame: listed as 'model.2' in the 'pairs' # function line (last line of code). The 'panel.cor' object is assigned a # slightly complex function for setting up our plot design; the 'pairs' # function actually completes the plot. panel.cor <- function(x, y, digits=3, prefix="", cex.cor, ...) { usr <- par("usr"); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r <- abs(cor(x,y)) txt <- format(c(r, 0.123456789), digits=digits)[1] txt <- paste(prefix, txt, sep="") if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt) text(0.5, 0.5, txt, cex = cex.cor) } pairs(model.2, lower.panel=panel.smooth, upper.panel=panel.cor) ##### Robust correlations using Wilcox library set.seed(123) mve.all <- cov.mve(model.2, cor=T) mve.all set.seed(123) mcd.all <- cov.mcd(model.2, cor=T) mcd.all # Use the naming convention '$' to identify pieces of the output above, and # then get just the correlations from that output. names(mve.all) names(mcd.all) mve.all$cor mcd.all$cor ############ Canonical Correlation ############ names(model.2) subset.x <- data.frame(apt, prison) subset.y <- data.frame(age, peyrs, bt) cancor(subset.x, subset.y) # Canonical Correlation Analysis. library(yacca) library(yhat) library(MBESS) canon <- cca(subset.x, subset.y) canon canons <- cca(subset.x, subset.y, xcenter = TRUE, ycenter = TRUE, xscale = TRUE, yscale = TRUE, standardize.scores = TRUE) canons canon.lambda <- exp(-canon$chisq/(nrow(subset.x)-1-.5*(ncol(subset.x)+ ncol(subset.y)+1))) canon.lambda # You can see the names of all the output objects by using the 'names' function. # Then you can access those objects (parts) by using the '$' operator. names(canon) canon$corr canon$corrsq canon$xcoef # Canonical Communality Analysis. canonCommonData <- canonCommonality(subset.x, subset.y, 1) canonCommonData # END: Minor updates, Apr. 23, 2013.