#
#
########## Partial Least Squares Examples. ##########
#
# Import the (simulated) data.
pls.data <- read.table("http://www.unt.edu/rss/class/Jon/R_SC/Module8/PLSdata001.txt",
header=TRUE, sep=",", na.strings="NA", dec=".", strip.white=TRUE)
# Calculate covariance matrix of the 20 variables.
cov.m <- cov(pls.data[,4:23])
cov.m
head(pls.data)
round(cor(pls.data[,3:23]), 3)
##################################################################################
# Check the relationships using traditional SEM.
library(sem)
measurement.model <- specifyModel()
F1 -> v1, lam11, NA
F1 -> v2, lam12, NA
F2 -> v3, lam21, NA
F2 -> v4, lam22, NA
F2 -> v5, lam23, NA
F3 -> v6, lam31, NA
F3 -> v7, lam32, NA
F3 -> v8, lam33, NA
F3 -> v9, lam34, NA
F3 -> v10, lam35, NA
F3 -> v11, lam36, NA
F4 -> v12, lam41, NA
F4 -> v13, lam42, NA
F4 -> v14, lam43, NA
F4 -> v15, lam44, NA
F5 -> v16, lam51, NA
F5 -> v17, lam52, NA
F5 -> v18, lam53, NA
F5 -> v19, lam54, NA
F5 -> v20, lam55, NA
v1 <-> v1, var1, NA
v2 <-> v2, var2, NA
v3 <-> v3, var3, NA
v4 <-> v4, var4, NA
v5 <-> v5, var5, NA
v6 <-> v6, var6, NA
v7 <-> v7, var7, NA
v8 <-> v8, var8, NA
v9 <-> v9, var9, NA
v10 <-> v10, var10, NA
v11 <-> v11, var11, NA
v12 <-> v12, var12, NA
v13 <-> v13, var13, NA
v14 <-> v14, var14, NA
v15 <-> v15, var15, NA
v16 <-> v16, var16, NA
v17 <-> v17, var17, NA
v18 <-> v18, var18, NA
v19 <-> v19, var19, NA
v20 <-> v20, var20, NA
F1 <-> F2, cov1, NA
F1 <-> F3, cov2, NA
F1 <-> F4, cov3, NA
F1 <-> F5, cov4, NA
F2 <-> F3, cov5, NA
F2 <-> F4, cov6, NA
F2 <-> F5, cov7, NA
F3 <-> F4, cov8, NA
F3 <-> F5, cov9, NA
F4 <-> F5, cov10, NA
F1 <-> F1, NA, 1
F2 <-> F2, NA, 1
F3 <-> F3, NA, 1
F4 <-> F4, NA, 1
F5 <-> F5, NA, 1
# Unfortunately, the model as specified does not converge.
sem.model.1 <- sem(measurement.model, cov.m, 1000, maxiter = 10000)
summary(sem.model.1, conf.level=0.95)
# It still does not converge after making some adjustments to the arguments.
sem.model.2 <- sem(measurement.model, cov.m, 1000, maxiter = 10000,
analytic.gradient = FALSE, par.size = c("ones"))
summary(sem.model.2, conf.level=0.95)
# Cleaning up the workspace.
ls()
rm(sem.model.1, sem.model.2, measurement.model, cov.m)
ls()
detach("package:sem")
detach("package:MASS")
detach("package:matrixcalc")
#########################################
#
## Partial Least Squares (PLS) modeling is often an alternative to traditional
## modeling techniques. Unlike traditional modeling techniques which rely upon
## covariance decomposition, PLS is a variance based (or components based)
## technique and does not carry with it many of the assumptions of covariance
## methods (i.e. distributional assumptions).
## It is sometimes considered an analysis of last resort because large samples
## are not as necessary with it, and PLS is less sensitive to multicollinearity.
## PLS is also quite versitile; it can be used as a regression technique,
## a principal components technique, a canonical correlation technique, or a path
## modeling (or structural equation modeling) technique. However, it is well
## documented that PLS is biased because the optimization is local rather than
## global level. As sample size increases PLS becomes less bias; however, PLS is
## often used when other methods fail (i.e. a slightly biased estimate is better
## than no estimate).
# Package 'plsdepot' (Partial Least Squares Data Analysis).
library(plsdepot)
######
# PLS regression: Using PLS with one outcome variable (i.e. dependent variable).
# First, create matrix or data frame of the predictors.
pred <- pls.data[,4:8]
# Here we are using a composite or summary score as the dependent variable.
attach(pls.data)
f3 <- v6 + v7 + v8 + v9 + v10 + v11
detach(pls.data)
# Run the PLS regression; nc = number of extracted PLS components (default is 2),
# cv = whether cross-validation should be performed (default is FALSE).
pls.reg <- plsreg1(predictors = pred, response = f3, comps = 2, crosval = FALSE)
pls.reg
# Plot of the results (after reviewing, you may want to close all the graphics windows).
plot(pls.reg)
# Standardized coefficients, un-standardized coefficients, and R-squared.
pls.reg$std.coefs
pls.reg$reg.coefs
pls.reg$R2
# Histogram of the residuals.
hist(pls.reg$resid)
# Cleaning up the workspace.
ls()
rm(f3, pls.reg, exprs)
graphics.off()
ls()
#####
# PLS regression WITH multiple outcome variables (aka. dependent variables).
# This is very similar to canonical correlation analysis, except; here we
# are postulating that X causes or predicts Y. In canonical correlation
# analysis there is only a 'relationship' being specified (i.e. non-directional).
# Here we will reuse the 'pred' object created above (for our predictors).
# And we will use only two variables (v6 & v7) as our outcome variables.
outco <- pls.data[,9:10]
# Run the PLS multiple outcome regression.
pls.mreg <- plsreg2(predictors = pred, responses = outco, comps = 2, crosval = TRUE)
pls.mreg
# Plot of the results (after reviewing, you may want to close all the graphics windows).
plot(pls.mreg)
pls.mreg$std.coef
pls.mreg$reg.coef
pls.mreg$Q2
hist(pls.mreg$resid)
# Cleaning the workspace.
ls()
rm(pred, outco, pls.mreg)
graphics.off()
ls()
#####
# Non-linear Iterative Partial Least Squares; performs a principal components
# analysis (PCA) with NIPALS algorithm (nc = number of components).
nipals.1 <- nipals(Data = pls.data[,4:23], comps = 5, scaled = TRUE)
nipals.1
nipals.1$values
nipals.1$scores
nipals.1$loadings
nipals.1$cos
# Plot of the results (after reviewing, you may want to close all the graphics windows).
plot(nipals.1)
# Cleaning the workspace.
ls()
rm(nipals.1)
graphics.off()
ls()
#####
# PLS-CA: Partial Least Squares Canonical Analysis.
# Performs partial least squares canonical analysis for TWO blocks of data. Compared
# to PLSR2, the blocks of variables in PLS-CA play a symmetrical role (i.e. there
# is neither predictors or predictands).
subset.x <- data.frame(pls.data[,4:8])
subset.y <- data.frame(pls.data[,9:14])
plsca.1 <- plsca(X = subset.x, Y = subset.y, comps = NULL, scaled = TRUE)
plsca.1
# Results can be retrieved using some simple commands; keep in mind, the
# "X" variate is referred to as 't' and the "Y" variate is referred to as 'u'.
# The correlations between each 'x' variable and 't'.
plsca.1$cor.xt
# The correlations between each 'y' variable and 'u'.
plsca.1$cor.yu
# The Canonical Correlation (between 't' and 'u' for each canonical solution).
plsca.1$cor.tu
# The explained variance of 'x' by 't'.
plsca.1$R2X
# The explained variance of 'y' by 't'.
plsca.1$R2Y
# Plot of the results (after reviewing, you may want to close all the graphics windows).
plot(plsca.1)
# Cleaning the workspace.
ls()
rm(plsca.1, subset.x, subset.y)
graphics.off()
detach("package:plsdepot")
ls()
#####
# PLS Path Modeling.
library(plspm)
### Gathering the necessary objects for the PLS Path Model.
# First, create the matrix which expresses the inner (structural) model; this model
# simply shows the relationships among the latent variables; where the column variable
# 'causes' the row variable if a 'one' is in the intersecting cell (e.g. f1 and f2
# cause f3 --> columns 1 and 2 cause row 3).
inner.matrix <- matrix(c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0,
1, 1, 0, 0, 0,
0, 1, 1, 0, 0,
0, 0, 1, 1, 0), 5, 5, byrow = TRUE)
dimnames(inner.matrix) <- list(c("f1", "f2", "f3", "f4", "f5"),
c("f1", "f2", "f3", "f4", "f5"))
inner.matrix
# Next, create the list which expresses the outer (measurement) model; this
# model simply shows the relationships between the manifest variables and the
# latent variables (e.g. variables v1 and v2 are related to the first factor [f1]).
outer.list <- list(c(1,2), c(3,4,5), c(6,7,8,9,10,11), c(12,13,14,15), c(16,17,18,19,20))
outer.list
# Next, create the vector which identifies what "mode" of indicators are used (i.e. "A" for
# reflective measurement or "B" for formative measurement). Recall, 'Reflective' measurement
# is said to occur when each manifest variable is "caused by" a latent variable and
# 'Formative' measurement is said to occur when each manifest variable "causes" the
# latent variable. Below, all 5 latent variables in our model are "reflectively"
# measured (i.e. each latent causes the observed scores on the manifest variables).
mode.vec <- c("A", "A", "A", "A", "A")
# Finally, we can run the Partial Least Squares Path Model.
pls.model.1 <- plspm(Data = pls.data[,4:23], path_matrix = inner.matrix,
blocks = outer.list, modes = mode.vec,
scheme = "factor", scaled = TRUE, tol = 0.00001, maxiter = 100)
pls.model.1
# The 'summary' function provides a very thorough summary with labels.
summary(pls.model.1)
names(pls.model.1)
# You can even create a path diagram.
plot(pls.model.1)
# PLS path modeling WITH bootstrapped validation.
pls.model.2 <- plspm(Data = pls.data[,4:23], path_matrix = inner.matrix, blocks = outer.list, modes = mode.vec,
scheme = "factor", scaled = TRUE, boot.val = TRUE, br = 200, tol = 0.00001, maxiter = 100)
pls.model.2
# When looking at the "BOOTSTRAP VALIDATION" part of the results summary, notice the
# 'Original' values are the same as the 'Mean.Boot' values (which should be the case).
# The 'Std.Error' (Standard Error) represents the bias associated with the estimates;
# notice the confidence intervals for each bootstrapped estimate.
summary(pls.model.2)
##
# Detaching package 'plspm' and its dependencies.
detach("package:plspm")
detach("package:amap")
detach("package:diagram")
detach("package:shape")
# Cleaning the workspace.
ls()
rm(inner.matrix, mode.vec, outer.list, pls.model.1, pls.model.2)
graphics.off()
ls()
################################################################################
# Moving on to the 'semPLS' package, which offers very similar functions for
# conducting PLS path modeling.
# First, create a data frame for the structural (also called inner) model. This
# model simply shows the relationships among the latent variables.
from <- c("f1", "f2", "f2", "f3", "f3", "f4")
to <- c("f3", "f3", "f4", "f4", "f5", "f5")
inner.mod <- data.frame(from, to)
inner.mod
# If desired, write the 'inner.mod' data frame out to the working directory as a comma separated
# values (.csv) file.
# write.table(inner.mod, "C:/Users/jds0282/Desktop/Workstuff/Jon_R/Example Data/inner.mod.csv",
# sep=",", col.names=TRUE, row.names=FALSE, quote=TRUE, na="NA")
# Remove the unnecessary objects from above.
rm(from, to)
# Next, create a data frame for the measurement (also called outer) model. This
# model specifies the manifest variables relationships to the latent variables.
from <- c("f1", "f1", "f2", "f2", "f2", "f3", "f3", "f3", "f3", "f3", "f3",
"f4", "f4", "f4", "f4", "f5", "f5", "f5", "f5", "f5")
to <- c("v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20")
outer.mod <- data.frame(from, to)
outer.mod
# If desired, write the 'outer.mod' data frame out to the working directory as a comma separated
# values (.csv) file.
# write.table(outer.mod, "C:/Users/jds0282/Desktop/Workstuff/Jon_R/Example Data/outer.mod.csv",
# sep=",", col.names=TRUE, row.names=FALSE, quote=TRUE, na="NA")
# Clean up the workspace.
rm(from, to)
# Load the 'semPLS' library which contains several functions for conducting Partial
# Least Squares analysis.
library(semPLS)
# Next, convert the two model data.frames into matrices; structural model and
# measurement model.
sm <- as.matrix(inner.mod)
sm
mm <- as.matrix(outer.mod)
mm
# Now we can create a single 'plsm' object which will be used to run the PLS path
# model.
plsm.obj <- plsm(data = pls.data[,4:23], strucmod = sm, measuremod = mm)
plsm.obj
# Now, we can take the 'plsm' object and submit it to the 'sempls' function to
# actually run the Partial Least Squares Path Model (or later, the bootstrapped
# version).
pls.pathmod.1 <- sempls(plsm.obj, data = pls.data[,4:23], maxit = 20, tol = 1e-7, scaled = TRUE,
sum1 = FALSE, E = "A", pairwise = FALSE, method = "pearson", convCrit = "relative")
summary(pls.pathmod.1)
names(pls.pathmod.1)
pls.pathmod.1
densityplot(pls.pathmod.1)
densityplot(pls.pathmod.1, use = "prediction")
densityplot(pls.pathmod.1, use = "residuals")
pls.pathmod.1$outer_weights
pls.pathmod.1$outer_loading
pls.pathmod.1$path_coefficients
pls.pathmod.1$total_effects
rSquared(pls.pathmod.1)
# The 'outer_loadings' from above are virtually the same as was produced with
# the 'plspm' function of the 'plspm' package above.
#####
# Bootstrapped PLS using an object from the 'sempls' function (as directly above).
# The default number of bootstrapped samples is 200.
b.plsmod.1 <- bootsempls(pls.pathmod.1, nboot = 200, start = "ones", verbose = TRUE)
summary(b.plsmod.1, type = "perc", level = 0.95)
names(b.plsmod.1)
# Inspection of bootstrap samples (parallel plot).
parallelplot(b.plsmod.1, subset = 1:ncol(b.plsmod.1$t), relinesAt = 0)
# Inspecting the path coefficients.
parallelplot(b.plsmod.1, pattern = "beta", reflinesAt = c(0,1))
densityplot(b.plsmod.1, pattern = "beta")
# Inspecting the outer loadings.
parallelplot(b.plsmod.1, pattern = "lam")
## Clean up.
search()
detach("package:semPLS")
detach("package:lattice")
detach("package:boot")
search()
ls()
rm(inner.mod, mm, outer.mod, pls.pathmod.1, plsm.obj, sm, b.plsmod.1)
graphics.off()
ls()
################################################################################
#
###################### REFERENCES & RESOURCES #######################
#
# Falk, R. F., & Miller, N. B. (1992). A primer for soft modeling. Akron, OH: University of
# Akron Press.
#
# Garson, D. (2011). Partial Least Squares. Statnotes. Accessed May 9, 2011; from:
# http://faculty.chass.ncsu.edu/garson/PA765/pls.htm
#
# Haenlein, M., & Kaplan, A. (2004). A beginner's guide to partial least squares analysis.
# Understanding Statistics, 3(4), 283 -- 297.
# Available at: http://www.stat.umn.edu/~sandy/courses/8801/articles/pls.pdf
#
# Lohmoller, J. (1989). Latent variable path modeling with partial least squares.
# New York: Springer-Verlag.
#
# Monecke, A. (2010). Package 'semPLS'. Available at CRAN:
# http://cran.r-project.org/web/packages/semPLS/index.html
#
# Sanchez, G. (2010). Package 'plspm'. Available at CRAN:
# http://cran.r-project.org/web/packages/plspm/index.html
#
# Tenenhaus, M., Vinzi, V. E., Chatelin, Y., & Lauro, C. (2005). PLS path modeling.
# Computational Statistics & Data Analysis, 48, 159 -- 205. Available at: www.sciencedirect.com
#
# Trinchera, L. (2007). Unobserved heterogeneity in structural equation models: A new
# approach to latent class detection in PLS path modeling. Doctoral dissertation.
# Available at: http://www.fedoa.unina.it/view/people/Trinchera,_Laura.html
# END; Oct 11, 2013.