# # ############### 'Some' useful Slicing and Dicing of Data ############### # # # This script assumes you have worked through all the previous notes from # the web page and you have downloaded, installed, and updated all available # R packages. # Slicing and dicing functions are often useful when dealing with data which is in Long # format and one needs the data in Wide format, or vice versus. Long format refers to # data in which each observation or participant has multiple rows. Wide format refers to # data in which each observation or participant has only one row. # Creating a data file for an example of extracting specific elements from a data file. x1.a <- rep(1, 10) x1.b <- rep(2, 10) x1.c <- rep(3, 10) x1.d <- rep(4, 10) x1.e <- rep(5, 10) x1.f <- rep(6, 10) x1.g <- rep(7, 10) x1.h <- rep(8, 10) x1.i <- rep(9, 10) x1.j <- rep(10,10) participant.id <- c(x1.a, x1.b, x1.c, x1.d, x1.e, x1.f, x1.g, x1.h, x1.i, x1.j) x1.numbers <- rep(seq(1:10), 10) x1.let <- factor(LETTERS[1:10]); names(x1.let) <- letters[1:10] x1.letters <- rep(x1.let, 10) x2.numbers <- rep(seq(1:5), 20) x2.let <- factor(LETTERS[1:5]); names(x2.let) <- letters[1:5] x2.letters <- rep(x2.let, 20) x3.numbers <- rep(seq(1:2), 50) x3.let <- factor(LETTERS[1:2]); names(x3.let) <- letters[1:2] x3.letters <- rep(x3.let, 50) x4 <- rnorm(100) x5 <- rnorm(100, 100, 15) x6 <- rnorm(100, 5, 1) WL.data <- data.frame(participant.id, x1.numbers, x1.letters, x2.numbers, x2.letters, x3.numbers, x3.letters, x4, x5, x6) # Cleaning up the workspace (house cleaning) by removing all the things we created to # make the data.frame. ls() rm(participant.id,x1.a,x1.b,x1.c,x1.d,x1.e,x1.f,x1.g,x1.h,x1.i,x1.j,x1.let,x1.letters,x1.numbers, x2.let,x2.letters,x2.numbers,x3.let,x3.letters,x3.numbers,x4,x5,x6) ls() ##################################################################################### # Take a look at the created data WL.data head(WL.data, n = 10) # The following can identify individuals with value "B" on variable "x2.letter" of data frame "WL.data". subset.1 <- which(WL.data$x2.letter == "B") subset.1 # The following gives a summary of all columns (variables) by "x2.letters". subset.2 <- by(WL.data[ , 1:10], WL.data$x2.letters, summary) subset.2 names(subset.2) subset.2$B # Creates a new data frame with just the participant's row data for the rows # having a "C" on "x2.letters" in "WL.data". subset.3 <- data.frame(WL.data[which(WL.data$x2.letter == "C"), 1:10]) subset.3 # Creates a new data frame with just the participant's row data for the rows # having a "B" on "x3.letters" in "WL.data". subset.4 <- data.frame(WL.data[which(WL.data$x3.letter == "B"), 1:10]) subset.4 # Create a new data frame with just the participant's row data for the rows # having a "C" on "x2.letters" in "WL.data" and the columns of the new data frame # include x4, x5, x6 (which are columns 8, 9, 10 in WL.data). subset.5 <- data.frame(WL.data[which(WL.data$x2.letter == "C"), 8:10]) subset.5 # Same as above (subset.5), accept the new data frame contains participant.id (column 1) as well. subset.5a <- data.frame(WL.data[which(WL.data$x2.letter == "C"), c(1,8:10)]) subset.5a # Create a new data frame with just the participant's row data for the rows # having a "B" on "x3.letters" in "WL.data" and the columns of the new data frame # include pariticipant.id, x4, and x6 (which are columns 1, 8, and 10 in WL.data). subset.6 <- data.frame(WL.data[which(WL.data$x3.letter == "B"), c(1,8,10)]) subset.6 ########################################################################### ### It can often take three or more steps to get a total score for a participant. ## Extract all data for just participant 1. p1 <- data.frame(WL.data[which(WL.data$participant.id == "1"), 1:10]) p1 ## Extract just the rows which have values of B on x3.letters for participant 1. p1.x3B <- data.frame(p1[which(p1$x3.letters == "B"), 1:10]) p1.x3B ## Create a total score for variable x4 for just participant 1 for just the ## rows with values of B on x3.letters. p1.x3B.x4total <- sum(p1.x3B$x4) p1.x3B.x4total ## Or, create a total score for the sum of x4, x5, x6 at just the row for the value ## of "H" on x1.letters. p1.x3B.x1H.456total <- sum(p1.x3B[1,8], p1.x3B[1,9], p1.x3B[1,10]) p1.x3B.x1H.456total # Same as above. p1.x3B.x1H.456total2 <- p1.x3B[1,8] + p1.x3B[1,9] + p1.x3B[1,10] p1.x3B.x1H.456total2 ############################################################################# # Creates a data frame with the sum of "x4" for each value of "x3.letters". subset.7 <- aggregate(WL.data[,c("x4"), drop=FALSE], by=list(x3.letters=WL.data$x3.letters), FUN=sum) subset.7 # Creates a data frame with the sum of "x4" for each value of "participant.id". subset.8 <- aggregate(WL.data[,c("x4"), drop=FALSE], by=list(participant.id=WL.data$participant.id), FUN=sum) subset.8 # Creates a data frame with the sum of "x4" for each value of "participant.id" by each # value of "x3.letters". subset.9 <- aggregate(WL.data[,c("x4"), drop=FALSE], by=list(participant.id=WL.data$participant.id, x3.letters=WL.data$x3.letters), FUN=sum) subset.9 # Recall, each participant has 5 "A" and 5 "B" (on x3.letters). head(WL.data, n = 22) ############################################################################# # Creates a Long data frame with the sum of "x6" for each "participant.id" by each # value of "x3.letters" (A & B). subset.10L <- aggregate(WL.data[,c("x6"), drop=FALSE], by=list(participant.id=WL.data$participant.id, x3.letters=WL.data$x3.letters), FUN=sum) subset.10L # Re-format the data set "subset.10" changing the format from Long to Wide. subset.10W <- data.frame(subset.10L[1:10, ], subset.10L[11:20, ]) subset.10W mean(subset.10W$x6) mean(subset.10W$x6.1) ######################################################################################### ######################################################################################### ############### Long to Wide and Wide to Long ############### # The WL.data file is originally in long format. head(WL.data) # Creating a new data file which removes the unnecessary columns between x1.letters and x3.letters. WL.data.1 <- WL.data[ , -c(3:7)] head(WL.data.1) ###### Going from Long format to Wide format. # The following creates a data.frame in which each row is one participant with each x4, x5, x6 score # for that participant at each x1.numbers. Recall, each participant has 10 values of x1.numbers (and # therefore, 10 x1.letters); so, each participant in the newly created data frame has 10 columns for x4 # scores, 10 columns for x5 scores, and 10 columns for x6 scores. This preserves all the data from the # original "WL.data.1" file. wide.x1 <- reshape(WL.data.1, v.names = c("x4","x5","x6"), idvar = "participant.id", timevar = "x1.numbers", direction = "wide") head(wide.x1) # The 'wide.x1' data file now allows us to deal with the x4, x5, or x6 scores for each participant, by # each value of x1.numbers we want. For instance, if we wanted the difference in x5 scores # between the first and last values of x2 for each participant; then we would use the following; which # returns a vector of 10 values, one value for each participant which represents the difference left from # subtracting each participant's x5 score (at time 1) from each participant's x5 score (at time 10). diff <- wide.x1$x5.1 - wide.x1$x5.10 diff ##### Going from Wide format to Long format. # Going from wide to long is easy because all the attributes of the data are still available # from the first reshape. long.x1 <- reshape(wide.x1) head(long.x1) # Original 'WL.data' example. wide2.x1 <- reshape(WL.data, v.names = c("x4","x5","x6"), drop = c("x1.letters","x2.numbers","x2.letters", "x3.numbers","x3.letters"),idvar = "participant.id", timevar = "x1.numbers", direction = "wide") head(wide2.x1) long2.x1 <- reshape(wide2.x1) head(long2.x1) # # For a better understanding of identifying or extracting elements of a vector, matrix, or list; # see the section "Slicing and extracting data" in the R Reference Card (Short, 2004) posted on the web site. # For a better understanding of the 'reshape' function and # all of its arguments: help(reshape) # It may also be helpful to consult the doBy package. library(doBy) help(doBy)