# # ##### Using Mahalanobis distance to evaluate multivariate outliers. # # First, create a data frame which is multivariate normal. set.seed(201307) n <- 1000 library(MASS) Sigma <- matrix(c(1.0, .80, .50, .20, .80, 1.0, .05, .05, .50, .05, 1.0, .05, .20, .05, .05, 1.0), ncol = 4) x <- mvrnorm(n, Sigma, mu = c(100,100,100,100), empirical = TRUE) detach("package:MASS"); rm(Sigma) df.1 <- data.frame(x); rm(x) names(df.1) <- c("y", "x1", "x2", "x3") summary(df.1) # Next, we insert some multivariate outlier values (here using # cases 50, 150, 250, 500, 750, 850, 950 as the outliers). out <- c(50,150,250,500,750,850,950) df.2 <- df.1 df.2[out,] <- df.2[out,] + c(sd(df.1[,1])*5, sd(df.1[,2])*5, sd(df.1[,3])*5, sd(df.1[,4])*5) df.1[out,] df.2[out,] #### Mahalanobis distance. ## Robust Mahalanobis distance. # Using library(chemometrics) and Moutlier function; note: there are several # dependent packages. library(chemometrics) # Next, get the Mahalanobis distances for each case of our current data. md.1 <- Moutlier(df.1, quantile = 0.99, plot = FALSE) md.1\$cutoff summary(md.1\$md) qqplot(md.1\$md, df.1\$y, plot.it = TRUE, xlab = "Mahalanobis' distance", ylab = "y", main = "DF.1") # Cases with extreme Mahalanobis distances (upper right) are likely to be # true multivariate outliers. # Now with the second data frame. md.2 <- Moutlier(df.2, quantile = 0.99, plot = FALSE) md.2\$cutoff summary(md.2\$md) qqplot(md.1\$md, df.2\$y, plot.it = TRUE, xlab = "Mahalanobis' distance", ylab = "y", main = "DF.2") # Cases with extreme Mahalanobis distances (upper right) are likely to be # true multivariate outliers. # Compare the two plots.... par(mfrow = c(1,2)) hist(md.1\$md, main = "DF.1") hist(md.2\$md, main = "DF.2") # Top 6 extreme values. head(sort(md.1\$md, decreasing = TRUE)) head(sort(md.2\$md, decreasing = TRUE)) # To identify those cases; simply use a which command with the 'cutoff' # value provided by the which(md.1\$md > md.1\$cutoff) which(md.2\$md > md.2\$cutoff) # End; 2013.07.11