# # ########## Partial Least Squares Examples. ########## # # Import the (simulated) data. pls.data <- read.table("http://www.unt.edu/rss/class/Jon/R_SC/Module8/PLSdata001.txt", header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE) # Calculate covariance matrix of the 20 variables. cov.m <- cov(pls.data[,4:23]) cov.m head(pls.data) round(cor(pls.data[,3:23]), 3) ################################################################################## # Check the relationships using traditional SEM. library(sem) measurement.model <- specify.model() F1 -> v1, lam11, NA F1 -> v2, lam12, NA F2 -> v3, lam21, NA F2 -> v4, lam22, NA F2 -> v5, lam23, NA F3 -> v6, lam31, NA F3 -> v7, lam32, NA F3 -> v8, lam33, NA F3 -> v9, lam34, NA F3 -> v10, lam35, NA F3 -> v11, lam36, NA F4 -> v12, lam41, NA F4 -> v13, lam42, NA F4 -> v14, lam43, NA F4 -> v15, lam44, NA F5 -> v16, lam51, NA F5 -> v17, lam52, NA F5 -> v18, lam53, NA F5 -> v19, lam54, NA F5 -> v20, lam55, NA v1 <-> v1, var1, NA v2 <-> v2, var2, NA v3 <-> v3, var3, NA v4 <-> v4, var4, NA v5 <-> v5, var5, NA v6 <-> v6, var6, NA v7 <-> v7, var7, NA v8 <-> v8, var8, NA v9 <-> v9, var9, NA v10 <-> v10, var10, NA v11 <-> v11, var11, NA v12 <-> v12, var12, NA v13 <-> v13, var13, NA v14 <-> v14, var14, NA v15 <-> v15, var15, NA v16 <-> v16, var16, NA v17 <-> v17, var17, NA v18 <-> v18, var18, NA v19 <-> v19, var19, NA v20 <-> v20, var20, NA F1 <-> F2, cov1, NA F1 <-> F3, cov2, NA F1 <-> F4, cov3, NA F1 <-> F5, cov4, NA F2 <-> F3, cov5, NA F2 <-> F4, cov6, NA F2 <-> F5, cov7, NA F3 <-> F4, cov8, NA F3 <-> F5, cov9, NA F4 <-> F5, cov10, NA F1 <-> F1, NA, 1 F2 <-> F2, NA, 1 F3 <-> F3, NA, 1 F4 <-> F4, NA, 1 F5 <-> F5, NA, 1 # Unfortunately, the model as specified does not converge. sem.model.1 <- sem(measurement.model, cov.m, 1000, maxiter = 10000) summary(sem.model.1, conf.level=0.95) # It still does not converge after making some adjustments to the arguments. sem.model.2 <- sem(measurement.model, cov.m, 1000, maxiter = 10000, analytic.gradient = FALSE, par.size = c("ones")) summary(sem.model.2, conf.level=0.95) # Cleaning up the workspace. ls() rm(sem.model.1, sem.model.2, measurement.model, cov.m) ls() detach("package:sem") ######################################### # ## Partial Least Squares (PLS) modeling is often an alternative to traditional ## modeling techniques. Unlike traditional modeling techniques which rely upon ## covariance decomposition, PLS is a variance based (or components based) ## technique and does not carry with it many of the assumptions of covariance ## methods (i.e. distributional assumptions). ## It is sometimes considered an analysis of last resort because large samples ## are not as necessary with it, and PLS is less sensitive to multicollinearity. ## PLS is also quite versitile; it can be used as a regression technique, ## a principal components technique, a canonical correlation technique, or a path ## modeling (or structural equation modeling) technique. However, it is well ## documented that PLS is biased because the optimization is local rather than ## global level. As sample size increases PLS becomes less bias; however, PLS is ## often used when other methods fail (i.e. a slightly biased estimate is better ## than no estimate). # Package 'plspm' (Partial Least Squares Path Modeling). library(plspm) ###### # PLS regression: Using PLS with one outcome variable (i.e. dependent variable). # First, create matrix or data frame of the predictors. pred <- pls.data[,4:8] # Here we are using a composite or summary score as the dependent variable. attach(pls.data) f3 <- v6 + v7 + v8 + v9 + v10 + v11 detach(pls.data) # Run the PLS regression; nc = number of extracted PLS components (default is 2), # cv = whether cross-validation should be performed (default is FALSE). pls.reg <- plsreg1(x = pred, y = f3, nc = 2, cv = FALSE) pls.reg # Plot of the results (after reviewing, you may want to close all the graphics windows). plot(pls.reg) # Standardized coefficients, un-standardized coefficients, and R-squared. pls.reg$std.coef pls.reg$coeffs pls.reg$R2 # Histogram of the residuals. hist(pls.reg$resid) # Cleaning up the workspace. ls() rm(f3, pls.reg, exprs) ls() ##### # PLS regression WITH multiple outcome variables (aka. dependent variables). # This is very similar to canonical correlation analysis, except; here we # are postulating that X causes or predicts Y. In canonical correlation # analysis there is only a 'relationship' being specified (i.e. non-directional). # Here we will reuse the 'pred' object created above (for our predictors). # And we will use only two variables (v6 & v7) as our outcome variables. outco <- pls.data[,9:10] # Run the PLS multiple outcome regression. pls.mreg <- plsreg2(X = pred, Y = outco, nc = 2) pls.mreg # Plot of the results (after reviewing, you may want to close all the graphics windows). plot(pls.mreg) pls.mreg$std.coef pls.mreg$coeffs pls.mreg$Q2 hist(pls.mreg$resid) # Cleaning the workspace. ls() rm(pred, outco, pls.mreg) ls() ##### # Non-linear Iterative Partial Least Squares; performs a principal components # analysis (PCA) with NIPALS algorithm (nc = number of components). nipals.1 <- nipals(pls.data[,4:23], nc = 5, scaled = TRUE) nipals.1 nipals.1$values nipals.1$scores nipals.1$loadings nipals.1$cor.sco # Plot of the results (after reviewing, you may want to close all the graphics windows). plot.nipals(nipals.1) # Cleaning the workspace. ls() rm(nipals.1) ls() ##### # PLS-CA: Partial Least Squares Canonical Analysis. # Performs partial least squares canonical analysis for TWO blocks of data. Compared # to PLSR2, the blocks of variables in PLS-CA play a symmetrical role (i.e. there # is neither predictors or predictands). subset.x <- data.frame(pls.data[,4:8]) subset.y <- data.frame(pls.data[,9:14]) plsca.1 <- plsca(X = subset.x, Y = subset.y, nc = NULL, scaled = TRUE) plsca.1 # Results can be retrieved using some simple commands; keep in mind, the # "X" variate is referred to as 't' and the "Y" variate is referred to as 'u'. # The correlations between each 'x' variable and 't'. plsca.1$cor.xt # The correlations between each 'y' variable and 'u'. plsca.1$cor.yu # The Canonical Correlation (between 't' and 'u' for each canonical solution). plsca.1$cor.tu # The explained variance of 'x' by 't'. plsca.1$R2X # The explained variance of 'y' by 't'. plsca.1$R2Y # Plot of the results (after reviewing, you may want to close all the graphics windows). plot(plsca.1) # Cleaning the workspace. ls() rm(plsca.1, subset.x, subset.y) ls() ##### # PLS Path Modeling. ### Gathering the necessary objects for the PLS Path Model. # First, create the matrix which expresses the inner (structural) model; this model # simply shows the relationships among the latent variables; where the column variable # 'causes' the row variable if a 'one' is in the intersecting cell (e.g. f1 and f2 # cause f3 --> columns 1 and 2 cause row 3). inner.matrix <- matrix(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0), 5, 5, byrow = TRUE) dimnames(inner.matrix) <- list(c("f1", "f2", "f3", "f4", "f5"), c("f1", "f2", "f3", "f4", "f5")) inner.matrix # Next, create the list which expresses the outer (measurement) model; this # model simply shows the relationships between the manifest variables and the # latent variables (e.g. variables v1 and v2 are related to the first factor [f1]). outer.list <- list(c(1,2), c(3,4,5), c(6,7,8,9,10,11), c(12,13,14,15), c(16,17,18,19,20)) outer.list # Next, create the vector which identifies what "mode" of indicators are used (i.e. "A" for # reflective measurement or "B" for formative measurement). Recall, 'Reflective' measurement # is said to occur when each manifest variable is "caused by" a latent variable and # 'Formative' measurement is said to occur when each manifest variable "causes" the # latent variable. Below, all 5 latent variables in our model are "reflectively" # measured (i.e. each latent causes the observed scores on the manifest variables). mode.vec <- c("A", "A", "A", "A", "A") # Finally, we can run the Partial Least Squares Path Model. pls.model.1 <- plspm(x = pls.data[,4:23], inner = inner.matrix, outer = outer.list, modes = mode.vec, scheme = "factor", scaled = TRUE, plsr = TRUE, tol = 0.00001, iter = 100) pls.model.1 # The 'summary' function provides a very thorough summary with labels. summary(pls.model.1) pls.model.1$loadings pls.model.1$path.coefs # You can even create a path diagram. plot(pls.model.1) # PLS path modeling WITH bootstrapped validation. pls.model.2 <- plspm(x = pls.data[,4:23], inner = inner.matrix, outer = outer.list, modes = mode.vec, scheme = "factor", scaled = TRUE, plsr = TRUE, boot.val = TRUE, br = 200, tol = 0.00001, iter = 100) pls.model.2 # When looking at the "BOOTSTRAP VALIDATION" part of the results summary, notice the # 'Original' values are the same as the 'Mean.Boot' values (which should be the case). # The 'Std.Error' (Standard Error) represents the bias associated with the estimates; # notice the confidence intervals for each bootstrapped estimate. summary(pls.model.2) ## # Detaching package 'plspm' and its dependencies. detach("package:plspm") detach("package:amap") detach("package:diagram") detach("package:shape") # Cleaning the workspace. ls() rm(inner.matrix, mode.vec, outer.list, pls.model.1, pls.model.2) ls() ################################################################################ # Moving on to the 'semPLS' package, which offers very similar functions for # conducting PLS path modeling. # First, create a data frame for the structural (also called inner) model. This # model simply shows the relationships among the latent variables. from <- c("f1", "f2", "f2", "f3", "f3", "f4") to <- c("f3", "f3", "f4", "f4", "f5", "f5") inner.mod <- data.frame(from, to) inner.mod # If desired, write the 'inner.mod' data frame out to the working directory as a comma separated # values (.csv) file. # write.table(inner.mod, "C:/Users/jds0282/Desktop/Workstuff/Jon_R/Example Data/inner.mod.csv", # sep=",", col.names=TRUE, row.names=FALSE, quote=TRUE, na="NA") # Remove the unnecessary objects from above. rm(from, to) # Next, create a data frame for the measurement (also called outer) model. This # model specifies the manifest variables relationships to the latent variables. from <- c("f1", "f1", "f2", "f2", "f2", "f3", "f3", "f3", "f3", "f3", "f3", "f4", "f4", "f4", "f4", "f5", "f5", "f5", "f5", "f5") to <- c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20") outer.mod <- data.frame(from, to) outer.mod # If desired, write the 'outer.mod' data frame out to the working directory as a comma separated # values (.csv) file. # write.table(outer.mod, "C:/Users/jds0282/Desktop/Workstuff/Jon_R/Example Data/outer.mod.csv", # sep=",", col.names=TRUE, row.names=FALSE, quote=TRUE, na="NA") # Clean up the workspace. rm(from, to) # Load the 'semPLS' library which contains several functions for conducting Partial # Least Squares analysis. library(semPLS) # Next, convert the two model data.frames into matrices; structural model and # measurement model. sm <- as.matrix(inner.mod) sm mm <- as.matrix(outer.mod) mm # Now we can create a single 'plsm' object which will be used to run the PLS path # model. plsm.obj <- plsm(data = pls.data[,4:23], strucmod = sm, measuremod = mm) plsm.obj # Now, we can take the 'plsm' object and submit it to the 'sempls' function to # actually run the Partial Least Squares Path Model (or later, the bootstrapped # version). pls.pathmod.1 <- sempls(plsm.obj, data = pls.data[,4:23], maxit = 20, tol = 1e-7, scaled = TRUE, sum1 = FALSE, E = "A", pairwise = FALSE, method = "pearson", convCrit = "relative") summary(pls.pathmod.1) names(pls.pathmod.1) pls.pathmod.1 densityplot(pls.pathmod.1) densityplot(pls.pathmod.1, use = "prediction") densityplot(pls.pathmod.1, use = "residuals") pls.pathmod.1$outer_weights pls.pathmod.1$outer_loading pls.pathmod.1$path_coefficients pls.pathmod.1$total_effects rSquared(pls.pathmod.1) # The 'outer_loadings' from above are virtually the same as was produced with # the 'plspm' function of the 'plspm' package above. ##### # Bootstrapped PLS using an object from the 'sempls' function (as directly above). # The default number of bootstrapped samples is 200. b.plsmod.1 <- bootsempls(pls.pathmod.1, nboot = 200, start = "ones", verbose = TRUE) summary(b.plsmod.1, type = "perc", level = 0.95) names(b.plsmod.1) # Inspection of bootstrap samples (parallel plot). parallel(b.plsmod.1, subset = 1:ncol(b.plsmod.1$t), relinesAt = 0) # Inspecting the path coefficients. parallel(b.plsmod.1, pattern = "beta", reflinesAt = c(0,1)) densityplot(b.plsmod.1, pattern = "beta") # Inspecting the outer loadings. parallel(b.plsmod.1, pattern = "lam") ## Clean up. search() detach("package:semPLS") detach("package:lattice") detach("package:boot") search() ls() rm(inner.mod, mm, outer.mod, pls.pathmod.1, plsm.obj, sm, b.plsmod.1) ls() ################################################################################ # ###################### REFERENCES & RESOURCES ####################### # # Falk, R. F., & Miller, N. B. (1992). A primer for soft modeling. Akron, OH: University of # Akron Press. # # Garson, D. (2011). Partial Least Squares. Statnotes. Accessed May 9, 2011; from: # http://faculty.chass.ncsu.edu/garson/PA765/pls.htm # # Haenlein, M., & Kaplan, A. (2004). A beginner's guide to partial least squares analysis. # Understanding Statistics, 3(4), 283 -- 297. # Available at: http://www.stat.umn.edu/~sandy/courses/8801/articles/pls.pdf # # Lohmoller, J. (1989). Latent variable path modeling with partial least squares. # New York: Springer-Verlag. # # Monecke, A. (2010). Package 'semPLS'. Available at CRAN: # http://cran.r-project.org/web/packages/semPLS/index.html # # Sanchez, G. (2010). Package 'plspm'. Available at CRAN: # http://cran.r-project.org/web/packages/plspm/index.html # # Tenenhaus, M., Vinzi, V. E., Chatelin, Y., & Lauro, C. (2005). PLS path modeling. # Computational Statistics & Data Analysis, 48, 159 -- 205. Available at: www.sciencedirect.com # # Trinchera, L. (2007). Unobserved heterogeneity in structural equation models: A new # approach to latent class detection in PLS path modeling. Doctoral dissertation. # Available at: http://www.fedoa.unina.it/view/people/Trinchera,_Laura.html # END; May 11, 2011.