#
#
######## A few Graphic Examples of Simpson's Paradox, in 100 lines or less.
#
#   Failure to recognize clusters or groups within the data may lead to
#   strikingly wrong conclusions about the relationships among the variables.
#   http://en.wikipedia.org/wiki/Simpson%27s_paradox
#
#   Every example leaves two devices open (pooled plot, then plot by group);
#   the next one starts with graphics.off(), so run one section at a time.
#
library(car)
n <- 100    # observations per group
N <- 3 * n  # pooled sample size; has to match length(group)
group <- rep(c("low", "med", "high"), each = n)
###### Baseline: a single cloud of points, so the group labels are arbitrary.
X <- rnorm(N, 100, 15); Y <- 0.8 * X + rnorm(N, 100, 15); cor(X, Y)
scatterplot(X, Y, smooth = FALSE)
dev.new()
# legend.plot is not passed: it exists only in older car releases (newer ones
# call it legend), and the legend is drawn by default once groups is supplied.
scatterplot(X, Y, ellipse = FALSE, groups = group)
###### Clustered data: slope -0.6 inside each group while the group centers
###### rise together. Residual SD is 30 here and shrinks in later sections.
graphics.off()
x1 <- rnorm(n, 50, 15); y1 <- -0.6 * x1 + rnorm(n, 50, 30); cor(x1, y1)
x2 <- rnorm(n, 100, 15); y2 <- -0.6 * x2 + rnorm(n, 100, 30); cor(x2, y2)
x3 <- rnorm(n, 150, 15); y3 <- -0.6 * x3 + rnorm(n, 150, 30); cor(x3, y3)
X <- c(x1, x2, x3); Y <- c(y1, y2, y3); cor(X, Y)
scatterplot(X, Y, smooth = FALSE)
dev.new()
scatterplot(X, Y, ellipse = FALSE, groups = group)
###### Same design, residual SD 20.
graphics.off()
x1 <- rnorm(n, 50, 15); y1 <- -0.6 * x1 + rnorm(n, 50, 20); cor(x1, y1)
x2 <- rnorm(n, 100, 15); y2 <- -0.6 * x2 + rnorm(n, 100, 20); cor(x2, y2)
x3 <- rnorm(n, 150, 15); y3 <- -0.6 * x3 + rnorm(n, 150, 20); cor(x3, y3)
X <- c(x1, x2, x3); Y <- c(y1, y2, y3); cor(X, Y)
scatterplot(X, Y, smooth = FALSE)
dev.new()
scatterplot(X, Y, ellipse = FALSE, groups = group)
###### Same design, residual SD 10.
graphics.off()
x1 <- rnorm(n, 50, 15); y1 <- -0.6 * x1 + rnorm(n, 50, 10); cor(x1, y1)
x2 <- rnorm(n, 100, 15); y2 <- -0.6 * x2 + rnorm(n, 100, 10); cor(x2, y2)
x3 <- rnorm(n, 150, 15); y3 <- -0.6 * x3 + rnorm(n, 150, 10); cor(x3, y3)
X <- c(x1, x2, x3); Y <- c(y1, y2, y3); cor(X, Y)
scatterplot(X, Y, smooth = FALSE)
dev.new()
scatterplot(X, Y, ellipse = FALSE, groups = group)
###### Same design, residual SD 5.
graphics.off()
x1 <- rnorm(n, 50, 15); y1 <- -0.6 * x1 + rnorm(n, 50, 5); cor(x1, y1)
x2 <- rnorm(n, 100, 15); y2 <- -0.6 * x2 + rnorm(n, 100, 5); cor(x2, y2)
x3 <- rnorm(n, 150, 15); y3 <- -0.6 * x3 + rnorm(n, 150, 5); cor(x3, y3)
X <- c(x1, x2, x3); Y <- c(y1, y2, y3); cor(X, Y)
scatterplot(X, Y, smooth = FALSE)
dev.new()
scatterplot(X, Y, ellipse = FALSE, groups = group)
###### Same design, residual SD 1.
graphics.off()
x1 <- rnorm(n, 50, 15); y1 <- -0.6 * x1 + rnorm(n, 50, 1); cor(x1, y1)
x2 <- rnorm(n, 100, 15); y2 <- -0.6 * x2 + rnorm(n, 100, 1); cor(x2, y2)
x3 <- rnorm(n, 150, 15); y3 <- -0.6 * x3 + rnorm(n, 150, 1); cor(x3, y3)
X <- c(x1, x2, x3); Y <- c(y1, y2, y3); cor(X, Y)
scatterplot(X, Y, smooth = FALSE)
dev.new()
scatterplot(X, Y, ellipse = FALSE, groups = group)
###### Clean up: close the devices, drop the objects, restore the search path.
graphics.off()
rm(group, N, n, x1, x2, x3, y1, y2, y3, X, Y)
# Which packages library(car) attaches depends on the car version (older ones
# attach MASS and nnet, newer ones carData), and detach() stops with an error
# for a package that is not attached, so only detach what is actually there.
# car comes first because the others cannot be detached while it needs them.
pkgs <- paste0("package:", c("car", "carData", "MASS", "nnet"))
invisible(lapply(intersect(pkgs, search()), detach, character.only = TRUE))
rm(pkgs)
search()
ls()
######
#
#   References & Resources.
#
#   Bickel, P. J., Hammel, E. A., & O'Connell, J. W. (1975). Sex bias in
#     graduate admissions: Data from Berkeley. Science, 187, 398 - 404.
#   Blyth, C. R. (1972). On Simpson's Paradox and the Sure-Thing Principle.
#     Journal of the American Statistical Association, 67, 364 - 366.
#   Fox, J., et al. (2012). Package 'car': A companion package to An R
#     Companion to Applied Regression, (2nd Ed.), Sage, 2011.
#   Kievit, R., & Epskamp, S. (2012). Package 'Simpsons': A package for
#     detecting Simpson's Paradox.
#   Pearl, J. (2009). Causality: Models, reasoning, and inference (2nd ed.).
#     [Chapter 6: Simpson's paradox, confounding, and collapsibility.] New York:
#     Cambridge University Press.
#   Ripley, B., Venables, B., Hornik, K., Gebhardt, A., & Firth, D. (2012).
#     Package 'MASS': Functions and datasets to support Venables and Ripley,
#     'Modern Applied Statistics with S' (4th ed.), 2002.
#   Ripley, B. (2012). Package 'nnet': Software for feed-forward neural networks
#     with a single hidden layer, and for multinomial log-linear models.
#   Robinson, W. S. (1950). Ecological correlations and the behavior of
#     individuals. American Sociological Review, 15, 351 - 357.
#   Rucker, G., & Schumacher, M. (2008). Simpson's paradox visualized: The
#     example of the Rosiglitazone meta-analysis. BMC Medical Research
#     Methodology, 8:34. DOI: 10.1186/1471-2288-8-34
#   Simpson, E. H. (1951). The interpretation of interaction in contingency
#     tables. Journal of the Royal Statistical Society (Series B:
#     Methodological), 13, 238 - 241.
#   Wagner, C. H. (1982). Simpson's paradox in real life. The American
#     Statistician, 36, 46 - 48.
#
#
#
#   Updated December 11, 2012.