#
#
############### 'Some' useful Slicing and Dicing of Data ###############
#
#
# This script assumes you have worked through all the previous notes from
# the web page and you have downloaded, installed, and updated all available
# R packages.
# Slicing and dicing functions are often useful when dealing with data which is in Long
# format and one needs the data in Wide format, or vice versus. Long format refers to
# data in which each observation or participant has multiple rows. Wide format refers to
# data in which each observation or participant has only one row.
# Creating a data file for an example of extracting specific elements from a data file.
x1.a <- rep(1, 10)
x1.b <- rep(2, 10)
x1.c <- rep(3, 10)
x1.d <- rep(4, 10)
x1.e <- rep(5, 10)
x1.f <- rep(6, 10)
x1.g <- rep(7, 10)
x1.h <- rep(8, 10)
x1.i <- rep(9, 10)
x1.j <- rep(10,10)
participant.id <- c(x1.a, x1.b, x1.c, x1.d, x1.e, x1.f, x1.g, x1.h, x1.i, x1.j)
x1.numbers <- rep(seq(1:10), 10)
x1.let <- factor(LETTERS[1:10]); names(x1.let) <- letters[1:10]
x1.letters <- rep(x1.let, 10)
x2.numbers <- rep(seq(1:5), 20)
x2.let <- factor(LETTERS[1:5]); names(x2.let) <- letters[1:5]
x2.letters <- rep(x2.let, 20)
x3.numbers <- rep(seq(1:2), 50)
x3.let <- factor(LETTERS[1:2]); names(x3.let) <- letters[1:2]
x3.letters <- rep(x3.let, 50)
x4 <- rnorm(100)
x5 <- rnorm(100, 100, 15)
x6 <- rnorm(100, 5, 1)
WL.data <- data.frame(participant.id, x1.numbers, x1.letters, x2.numbers, x2.letters, x3.numbers, x3.letters,
x4, x5, x6)
# Cleaning up the workspace (house cleaning) by removing all the things we created to
# make the data.frame.
ls()
rm(participant.id,x1.a,x1.b,x1.c,x1.d,x1.e,x1.f,x1.g,x1.h,x1.i,x1.j,x1.let,x1.letters,x1.numbers,
x2.let,x2.letters,x2.numbers,x3.let,x3.letters,x3.numbers,x4,x5,x6)
ls()
#####################################################################################
# Take a look at the created data
WL.data
head(WL.data, n = 10)
# The following can identify individuals with value "B" on variable "x2.letter" of data frame "WL.data".
subset.1 <- which(WL.data$x2.letter == "B")
subset.1
# The following gives a summary of all columns (variables) by "x2.letters".
subset.2 <- by(WL.data[ , 1:10], WL.data$x2.letters, summary)
subset.2
names(subset.2)
subset.2$B
# Creates a new data frame with just the participant's row data for the rows
# having a "C" on "x2.letters" in "WL.data".
subset.3 <- data.frame(WL.data[which(WL.data$x2.letter == "C"), 1:10])
subset.3
# Creates a new data frame with just the participant's row data for the rows
# having a "B" on "x3.letters" in "WL.data".
subset.4 <- data.frame(WL.data[which(WL.data$x3.letter == "B"), 1:10])
subset.4
# Create a new data frame with just the participant's row data for the rows
# having a "C" on "x2.letters" in "WL.data" and the columns of the new data frame
# include x4, x5, x6 (which are columns 8, 9, 10 in WL.data).
subset.5 <- data.frame(WL.data[which(WL.data$x2.letter == "C"), 8:10])
subset.5
# Same as above (subset.5), accept the new data frame contains participant.id (column 1) as well.
subset.5a <- data.frame(WL.data[which(WL.data$x2.letter == "C"), c(1,8:10)])
subset.5a
# Create a new data frame with just the participant's row data for the rows
# having a "B" on "x3.letters" in "WL.data" and the columns of the new data frame
# include pariticipant.id, x4, and x6 (which are columns 1, 8, and 10 in WL.data).
subset.6 <- data.frame(WL.data[which(WL.data$x3.letter == "B"), c(1,8,10)])
subset.6
###########################################################################
### It can often take three or more steps to get a total score for a participant.
## Extract all data for just participant 1.
p1 <- data.frame(WL.data[which(WL.data$participant.id == "1"), 1:10])
p1
## Extract just the rows which have values of B on x3.letters for participant 1.
p1.x3B <- data.frame(p1[which(p1$x3.letters == "B"), 1:10])
p1.x3B
## Create a total score for variable x4 for just participant 1 for just the
## rows with values of B on x3.letters.
p1.x3B.x4total <- sum(p1.x3B$x4)
p1.x3B.x4total
## Or, create a total score for the sum of x4, x5, x6 at just the row for the value
## of "H" on x1.letters.
p1.x3B.x1H.456total <- sum(p1.x3B[1,8], p1.x3B[1,9], p1.x3B[1,10])
p1.x3B.x1H.456total
# Same as above.
p1.x3B.x1H.456total2 <- p1.x3B[1,8] + p1.x3B[1,9] + p1.x3B[1,10]
p1.x3B.x1H.456total2
#############################################################################
# Creates a data frame with the sum of "x4" for each value of "x3.letters".
subset.7 <- aggregate(WL.data[,c("x4"), drop=FALSE],
by=list(x3.letters=WL.data$x3.letters), FUN=sum)
subset.7
# Creates a data frame with the sum of "x4" for each value of "participant.id".
subset.8 <- aggregate(WL.data[,c("x4"), drop=FALSE],
by=list(participant.id=WL.data$participant.id), FUN=sum)
subset.8
# Creates a data frame with the sum of "x4" for each value of "participant.id" by each
# value of "x3.letters".
subset.9 <- aggregate(WL.data[,c("x4"), drop=FALSE],
by=list(participant.id=WL.data$participant.id, x3.letters=WL.data$x3.letters),
FUN=sum)
subset.9
# Recall, each participant has 5 "A" and 5 "B" (on x3.letters).
head(WL.data, n = 22)
#############################################################################
# Creates a Long data frame with the sum of "x6" for each "participant.id" by each
# value of "x3.letters" (A & B).
subset.10L <- aggregate(WL.data[,c("x6"), drop=FALSE],
by=list(participant.id=WL.data$participant.id, x3.letters=WL.data$x3.letters),
FUN=sum)
subset.10L
# Re-format the data set "subset.10" changing the format from Long to Wide.
subset.10W <- data.frame(subset.10L[1:10, ], subset.10L[11:20, ])
subset.10W
mean(subset.10W$x6)
mean(subset.10W$x6.1)
#########################################################################################
#########################################################################################
############### Long to Wide and Wide to Long ###############
# The WL.data file is originally in long format.
head(WL.data)
# Creating a new data file which removes the unnecessary columns between x1.letters and x3.letters.
WL.data.1 <- WL.data[ , -c(3:7)]
head(WL.data.1)
###### Going from Long format to Wide format.
# The following creates a data.frame in which each row is one participant with each x4, x5, x6 score
# for that participant at each x1.numbers. Recall, each participant has 10 values of x1.numbers (and
# therefore, 10 x1.letters); so, each participant in the newly created data frame has 10 columns for x4
# scores, 10 columns for x5 scores, and 10 columns for x6 scores. This preserves all the data from the
# original "WL.data.1" file.
wide.x1 <- reshape(WL.data.1, v.names = c("x4","x5","x6"), idvar = "participant.id",
timevar = "x1.numbers", direction = "wide")
head(wide.x1)
# The 'wide.x1' data file now allows us to deal with the x4, x5, or x6 scores for each participant, by
# each value of x1.numbers we want. For instance, if we wanted the difference in x5 scores
# between the first and last values of x2 for each participant; then we would use the following; which
# returns a vector of 10 values, one value for each participant which represents the difference left from
# subtracting each participant's x5 score (at time 1) from each participant's x5 score (at time 10).
diff <- wide.x1$x5.1 - wide.x1$x5.10
diff
##### Going from Wide format to Long format.
# Going from wide to long is easy because all the attributes of the data are still available
# from the first reshape.
long.x1 <- reshape(wide.x1)
head(long.x1)
# Original 'WL.data' example.
wide2.x1 <- reshape(WL.data, v.names = c("x4","x5","x6"), drop = c("x1.letters","x2.numbers","x2.letters",
"x3.numbers","x3.letters"),idvar = "participant.id", timevar = "x1.numbers", direction = "wide")
head(wide2.x1)
long2.x1 <- reshape(wide2.x1)
head(long2.x1)
#
# For a better understanding of identifying or extracting elements of a vector, matrix, or list;
# see the section "Slicing and extracting data" in the R Reference Card (Short, 2004) posted on the web site.
# For a better understanding of the 'reshape' function and
# all of its arguments:
help(reshape)
# It may also be helpful to consult the doBy package.
library(doBy)
help(doBy)