# # ########## Bayesian Regression / Generalized Linear Models ########## # # # Generate some "simulated" data for the examples below. n <- 500 int <- -5 open <- rnorm(n, 50, 7) agree <- rnorm(n, 100, 15) social <- rnorm(n, 25, 4) e <- rnorm(n, 0, 1) extro <- int + (1.2 * open) + (.9 * agree) + (.4 * social) + e data1 <- data.frame(extro, open, agree, social) summary(data1) cor(data1) rm(n, int, open, agree, social, e, extro) ls() # Confirm / take a look at the core Linear Model (lm) [NOTE: using the optional # arguments "x = TRUE" and "y = TRUE" allows us to access the design matrix "x" # and the response vector "y"; both of which are used in the Bayesian Linear # Regression below]. model.1 <- lm(extro ~ open + agree + social, data = data1, x = TRUE, y = TRUE) summary(model.1) # Same as above; using the Generalized Linear Model (glm) function, specifying the default # Gaussian (normal) family distribution. model.2 <- glm(extro ~ open + agree + social, data = data1, family = "gaussian") summary(model.2) # One of the benefits of the Bayesian perspective (for any analysis) is that it allows us to make 'credible # interval' statements. Credible intervals are similar to confidence intervals, but in the Bayesian # framework, the interval REALLY IS believed to contain the true population parameter. # For instance: a 95% credible interval for a parameter being estimated, from the Bayesian perspective, # is interpreted as; there is a 95% probability that the actual parameter is included in that interval. # This is because the interval is constructed based on information from the posterior distribution; of for instance, # one of the predictor's coefficient posterior distribution (e.g. the open variable's coeffient posterior distribution). ### Bayesian Linear Regression. # Load the package 'LearnBayes' (Albert, 2010). library(LearnBayes) # Conduct the Bayesian Linear Regression using the 'blinreg' function; which returns $beta "a matrix # of simulated draws from the marginal posterior of Beta, where each row is a simulated draw", and # $sigma, which "is a vector of simulated draws from the marginal posterior of Sigma". joint.posterior.samples <- blinreg(model.1$y, model.1$x, 5000) oldpar <- par(oma=c(0,0,3,0), mfrow=c(2,2)) hist(joint.posterior.samples$beta[,1], main = "Intercept", xlab = "beta 0") hist(joint.posterior.samples$beta[,2], main = "Open Predictor", xlab = "beta 1") hist(joint.posterior.samples$beta[,3], main = "Agree Predictor", xlab = "beta 2") hist(joint.posterior.samples$beta[,4], main = "Social Predictor", xlab = "beta 3") par(oldpar) # To display the 95% credible intervals (and medians) from the distributions, use an 'apply' function. apply(joint.posterior.samples$beta, 2, quantile, c(.025, .500, .975)) ### Bayesian Generalized Linear Modeling. # Load the required package: 'arm' (Gelman, et al., 2010). library(arm) # Conduct the Bayesian Generalized linear model (here family = Gaussian, as is default). model.3 <- bayesglm (extro ~ open + agree + social, family = "gaussian", data = data1, prior.scale=Inf, prior.df=Inf) summary(model.3) # The 'bayesglm' function represents a kind of short cut of the Bayesian approach to inference. Typically, # the posterior is not used directly for making inferences. Instead, an empirical distribution is constructed # based on draws from the posterior and that empirical distribution is what informs the inference(s). Here, we # are using the 'bayesglm' as a proxy for doing the added empirical distribution. With the 'bayesglm' we # get a distribution of 'simulates' which are used in place of an actual empirical distribution (which will # be covered further below). # Retrieve the posterior distributions of the coefficients for the intercept and all three predictors. simulates <- coef(sim(model.3)) head(simulates) # Extract just the posterior distribution of the 'open' variable's coefficient. posterior.open <- simulates[,2] head(posterior.open) # Take a look at the posterior distribution of the open variable's coefficient (normally a histogram # would not be used, it is used here simply as a graphical reference). hist(posterior.open) plot(density(posterior.open), main = "", xlab = "Posterior.open", ylab = "Density") # Retrieve the 95% credible interval for the open variable's coefficient. quantile(posterior.open, c(.025, .975)) # To see the entire list of optional arguments (there are many) for the 'bayesglm' function: help(bayesglm) ######################################################################## # Going further to actually creating an empirical distribution based on iterative draws from the # posterior. The 'MCMCregress' function in the package "MCMCpack" provides us with the Markov Chain # Monte Carlo simulation method of creating the empirical distribution; which itself provides us # with the descriptive statistics used for inference. Meaning, the mode, median, or mean of the # empirical MCMC simulates' distribution is the 'maximum likelihood' (i.e. top of a density function) # estimate of the population parameter. And, the credible interval which includes the actual population # parameter value. library(MCMCpack) # Notice, the model formula is the same, but here we have some new options. The 'burnin' is used # because MCMC iterates are sensitive to their initial start values, so the first few (i.e. 3000) # iterations are discarded. The 'mcmc' simply issues how many (post-burnin) iterations will be # used to build the empirical distribution. The 'thin' defaults to 1 and represents a control # on convergence, such that once approximate convergence has been reached it can be beneficial # to keep only a few simulates and discard the rest to conserve computer resources (Gelman, Carlin, # Stern, & Rubin, 2004). The verbose option (by default is off) simply does or does not print # the iteration history as the function runs. The seed argument simply allows the user to set # the random number generator seed. The 'beta.start' argument allows the user to set a start # value for the beta vector. model.4 <- MCMCregress(extro ~ open + agree + social, data = data1, burnin = 3000, mcmc = 10000, thin = 1, verbose = 0, seed = NA, beta.start = NA) summary(model.4) # Notice in the summary, we get the coefficient estimates and credible intervals (in the second # part of the output, labeled "Quantiles for each variable:"). # To see all the optional arguments and their explanations in MCMCregress; help(MCMCregress) ### References / Resources ### # Albert, J. (2007). Bayesian Computation with R. New York: Springer Science+Business Media, LLC. # Albert, J. (2010). Package 'LearnBayes'. Available at CRAN: # http://cran.r-project.org/web/packages/LearnBayes/index.html # Berry, D. A. (1996). Statistics: A Bayesian perspective. Belmont, CA: Wadsworth Publishing Company. # Bolker, B. M. (2008). Ecological Models and Data in R. Princeton, NJ: Princeton University Press. # Bolstad, W. M. (2004). Introduction to Bayesian statistics. Hoboken, NJ: John Wiley & Sons, Inc. # Gelman, A., Carlin, J. B., Stern, H. S., & Rubin, D. B. (2004). Bayesian Data Analysis (2nd ed.). # Boca Raton, FL: Chapman & Hall/CRC. # Gelman, A., Jakulin, A., Pittau, M. G., & Su, Y. (2009). A Weakly Informative Default Prior # Distribution For Logistic And Other Regression Models. The Annals of Applied Statistics, 2(4), # 1360-1383. # Gelman, A., Su, Y., Yajima, M., Hill, J., Pittau, M. G., Kerman, J., & Zheng, T. (2010). Package 'arm'. # Available at: # http://cran.r-project.org/web/packages/arm # Hoff, P. D. (2009). A First Course in Bayesian Statistical Methods. New York: Springer Science+Business # Media, LLC. # Martin, A. D., Quinn, K. M., & Park, J. H. (2010). Package 'MCMCpack'. Available at: # http://cran.r-project.org/web/packages/MCMCpack/MCMCpack.pdf # END: Updated Mar. 8, 2011 (added 'blinreg' function and associated operations.