I'm trying to build a long-format data set, with 2 between-subject variables and 2 within-subject variables, from an Excel table. The current structure of the data set is the following:
> str(Subset_0)
'data.frame': 54 obs. of 11 variables:
$ Subject : num 1 2 3 4 5 6 7 8 9 10 ...
$ BETWEEN1: num 1 1 1 2 2 2 2 1 1 2 ...
$ BETWEEN2: num 1 1 2 2 2 2 1 1 1 1 ...
$ A_x1 : num 5 1 3 1 0 6 1 2 7 1 ...
$ B_x2 : num 5 1 3 0 3 0 0 2 6 1 ...
$ C_y1 : num 6 9 9 2 2 4 2 2 6 0 ...
$ D_y2 : num 6 15 4 1 2 4 3 1 3 0 ...
$ K_x1 : num 5 1 3 1 0 6 1 2 7 1 ...
$ L_x2 : num 5 1 3 0 3 0 0 2 6 1 ...
$ M_y1 : num 6 9 9 2 2 4 2 2 6 14 ...
$ N_y2 : num 3 1 0 4 0 5 6 5 17 21 ...
Data from dput:
structure(list(Subject = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55), BETWEEN1 = c(1,
1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1,
2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), BETWEEN2 = c(1,
1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), A_x1 = c(5,
1, 3, 1, 0, 6, 1, 2, 7, 1, 1, 0, 0, 2, 0, 8, NA, NA, NA, NA,
14, 23, 19, 10, 9, 10, 11, 14, 16, 8, 24, 17, 8, 22, 14, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), B_x2 = c(5, 1, 3, 0, 3, 0, 0, 2, 6, 1, 0, 0, 0, 0, 1,
7, 14, 23, 19, 10, 14, 29, 15, 7, 13, 16, 7, 9, 17, 6, 7, 16,
6, 11, 13, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), C_y1 = c(6, 9, 9, 2, 2, 4, 2, 2, 6,
0, 6, 0, 1, 10, 3, 8, 14, 29, 15, 7, 17, 21, 24, 7, 32, 31, 31,
21, 27, 29, 18, 27, 33, 23, 28, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), D_y2 = c(6, 15,
4, 1, 2, 4, 3, 1, 3, 0, 0, 0, 2, 2, 2, 5, 17, 21, 24, 7, 24,
16, 28, 7, 28, 23, 25, 25, 24, 28, 33, 27, 31, 33, 21, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), K_x1 = c(5, 1, 3, 1, 0, 6, 1, 2, 7, 1, 1, 0, 0, 2, 0, 8,
24, 16, 28, 7, 24, 31, 31, 13, 32, 35, 32, 22, 29, 32, 32, 29,
34, 32, 34, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), L_x2 = c(5, 1, 3, 0, 3, 0, 0, 2, 6,
1, 0, 0, 0, 0, 1, 7, 24, 31, 31, 13, 30, 30, 34, 12, 31, 27,
23, 25, 33, 28, 31, 29, 30, 36, 24, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), M_y1 = c(6,
9, 9, 2, 2, 4, 2, 2, 6, 14, 23, 19, 10, 9, 10, 11, 14, 16, 8,
24, 17, 8, 22, 14, 33, 28, 31, 14, 23, 19, 10, 9, 10, 11, 14,
16, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA), N_y2 = c(3, 1, 0, 4, 0, 5, 6, 5, 17, 21, 24, 7,
32, 31, 31, 21, NA, NA, NA, NA, 27, 29, 18, 27, NA, NA, 17, 21,
24, 7, 32, 31, 31, 21, 27, 17, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-54L))
I need to reshape it to one row per subject and condition: one column, called 'First', holding the values of A, B, C and D, and another column, called 'Second', holding the values of K, L, M and N. In addition, the x, y, 1 and 2 in these variable names represent within-subject factors, which I also need in two more columns: 'Within1' for x and y, and 'Within2' for 1 and 2. Finally, I need two columns, 'Between1' and 'Between2', which are the between-subject factors.
I need it to look like this:
Subject First Second Within2 Within1 Between1 Between2
1 Ai Ki 1 x 1 1
1 Bi Li 2 x 1 1
1 Ci Mi 1 y 1 1
1 Di Ni 2 y 1 1
2 Ai Ki 1 x 1 1
2 Bi Li 2 x 1 1
2 Ci Mi 1 y 1 1
2 Di Ni 2 y 1 1
...
I tried to use the reshape function twice. The first time, to gather A, B, C and D into one column and separate the within-subject variables from it, I succeeded:
# Stack the four "First" measures (A_x1, B_x2, C_y1, D_y2) into one column.
# The column names in `varying`/`times` must match the data exactly
# (no leading blanks), otherwise reshape() cannot find them.
# After this call `Within1` temporarily holds the original column name
# (e.g. "A_x1") for every stacked row.
Subset_1 <- reshape(
  Subset_0,
  varying = c("A_x1", "B_x2", "C_y1", "D_y2"),
  v.names = "First",
  timevar = "Within1",
  times = c("A_x1", "B_x2", "C_y1", "D_y2"),
  direction = "long"
)

# Within2: the trailing digit of the original column name ("1" or "2").
# This has to be derived BEFORE Within1 is overwritten below, because the
# digit is only available in the full name.
Subset_1$Within2 <- sub("^.*([0-9])$", "\\1", Subset_1$Within1)

# Within1: the letter between the underscore and the digit ("x" or "y").
Subset_1$Within1 <- sub("^[^_]*_([A-Za-z])[0-9]$", "\\1", Subset_1$Within1)
The problem is that I need to do the same for another column ('Second'). I tried to use reshape again, as I did before, this time applied to Subset_1, but it doesn't do what I need.
Is there a way to do this?
CodePudding user response:
Here is one option with pivot_longer. I know I separated a bit too much, but it is just to avoid confusion with the names. You can adjust them according to your desired output.
library(dplyr) # select() and distinct() are dplyr functions, not tidyr ones
library(tidyr)

# `df` stands for the wide data frame from the question.
# NOTE: pivoting the two column groups one after the other pairs every
# A-D row with every K-N row of the same subject, and the measured values
# (`value`, `value2`) are dropped at the end, so the result only contains
# the distinct label combinations.
df %>%
  pivot_longer(
    cols = c("A_x1", "B_x2", "C_y1", "D_y2"),
    names_to = "first"
  ) %>%
  pivot_longer(
    cols = c("K_x1", "L_x2", "M_y1", "N_y2"),
    names_to = "second",
    values_to = "value2"
  ) %>%
  # split e.g. "A_x1" into the measure letter and the "x1" suffix ...
  separate(first, into = c("first", "Within1"), sep = "_") %>%
  # ... then split the suffix at the boundary between letter and digit
  separate(
    Within1,
    into = c("Within1", "Within1_2"),
    sep = "(?<=[A-Za-z])(?=[0-9])"
  ) %>%
  # same two steps for the K-N names
  separate(second, into = c("second", "Within2"), sep = "_") %>%
  separate(
    Within2,
    into = c("Within2", "Within2_2"),
    sep = "(?<=[A-Za-z])(?=[0-9])"
  ) %>%
  select(-c(value, value2)) %>%
  distinct()
CodePudding user response:
This looks like it gets your given example result:
# pipe
library(magrittr)
# Input data: the wide data frame from the question, recreated from its
# dput() output. 54 rows and 11 columns: Subject, the two between-subject
# factors (BETWEEN1, BETWEEN2) and eight measures named <letter>_<x|y><1|2>
# (A-D and K-N). Subject id 53 is absent, and the last 18 rows are NA in
# both between-subject factors.
dxyz <- structure(list(Subject = c(
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55
), BETWEEN1 = c(
1,
1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1,
2, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), BETWEEN2 = c(
1,
1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), A_x1 = c(
5,
1, 3, 1, 0, 6, 1, 2, 7, 1, 1, 0, 0, 2, 0, 8, NA, NA, NA, NA,
14, 23, 19, 10, 9, 10, 11, 14, 16, 8, 24, 17, 8, 22, 14, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA
), B_x2 = c(
5, 1, 3, 0, 3, 0, 0, 2, 6, 1, 0, 0, 0, 0, 1,
7, 14, 23, 19, 10, 14, 29, 15, 7, 13, 16, 7, 9, 17, 6, 7, 16,
6, 11, 13, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA
), C_y1 = c(
6, 9, 9, 2, 2, 4, 2, 2, 6,
0, 6, 0, 1, 10, 3, 8, 14, 29, 15, 7, 17, 21, 24, 7, 32, 31, 31,
21, 27, 29, 18, 27, 33, 23, 28, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), D_y2 = c(
6, 15,
4, 1, 2, 4, 3, 1, 3, 0, 0, 0, 2, 2, 2, 5, 17, 21, 24, 7, 24,
16, 28, 7, 28, 23, 25, 25, 24, 28, 33, 27, 31, 33, 21, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA
), K_x1 = c(
5, 1, 3, 1, 0, 6, 1, 2, 7, 1, 1, 0, 0, 2, 0, 8,
24, 16, 28, 7, 24, 31, 31, 13, 32, 35, 32, 22, 29, 32, 32, 29,
34, 32, 34, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA
), L_x2 = c(
5, 1, 3, 0, 3, 0, 0, 2, 6,
1, 0, 0, 0, 0, 1, 7, 24, 31, 31, 13, 30, 30, 34, 12, 31, 27,
23, 25, 33, 28, 31, 29, 30, 36, 24, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), M_y1 = c(
6,
9, 9, 2, 2, 4, 2, 2, 6, 14, 23, 19, 10, 9, 10, 11, 14, 16, 8,
24, 17, 8, 22, 14, 33, 28, 31, 14, 23, 19, 10, 9, 10, 11, 14,
16, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA
), N_y2 = c(
3, 1, 0, 4, 0, 5, 6, 5, 17, 21, 24, 7,
32, 31, 31, 21, NA, NA, NA, NA, 27, 29, 18, 27, NA, NA, 17, 21,
24, 7, 32, 31, 31, 21, 27, 17, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
)), class = "data.frame", row.names = c(
NA,
-54L
))
# Extract A, B, C, D in long format with the within-subject suffix separated.
# Result: one row per subject and A-D column, with the columns Subject,
# BETWEEN1, BETWEEN2, First (measure letter), value, Within21 (digit) and
# Within22 (letter).
abcd <- dxyz %>%
  tidyr::pivot_longer(-c(Subject, BETWEEN1, BETWEEN2)) %>%
  # "A_x1" -> First = "A", Within = "x1"
  tidyr::separate(col = name, sep = "_", into = c("First", "Within")) %>%
  dplyr::filter(First %in% c("A", "B", "C", "D")) %>%
  dplyr::mutate(
    # str_extract() returns exactly one value per row (NA when nothing
    # matches), so the new columns always line up with the rows. Flattening
    # str_extract_all() with unlist() only lines up when every string
    # happens to contain exactly one match.
    Within21 = stringr::str_extract(Within, "[:digit:]"),
    Within22 = stringr::str_extract(Within, "[:alpha:]")
  ) %>%
  dplyr::select(-Within)
# Extract K, L, M, N in long format with the within-subject suffix separated.
# Result: one row per subject and K-N column, with the columns Subject,
# BETWEEN1, BETWEEN2, Second (measure letter), value, Within21 (digit) and
# Within22 (letter).
klmn <- dxyz %>%
  tidyr::pivot_longer(-c(Subject, BETWEEN1, BETWEEN2)) %>%
  # "K_x1" -> Second = "K", Within = "x1"
  tidyr::separate(col = name, sep = "_", into = c("Second", "Within")) %>%
  dplyr::filter(Second %in% c("K", "L", "M", "N")) %>%
  dplyr::mutate(
    # str_extract() returns exactly one value per row (NA when nothing
    # matches), so the new columns always line up with the rows. Flattening
    # str_extract_all() with unlist() only lines up when every string
    # happens to contain exactly one match.
    Within21 = stringr::str_extract(Within, "[:digit:]"),
    Within22 = stringr::str_extract(Within, "[:alpha:]")
  ) %>%
  dplyr::select(-Within)
# Combine the two long tables: every A-D row is matched with the K-N row of
# the same subject, between-subject factors and within-subject levels, then
# the columns are put into presentation order. `value.x` is the A-D
# measurement and `value.y` the K-N measurement.
dplyr::select(
  dplyr::left_join(
    abcd,
    klmn,
    by = c("Subject", "BETWEEN1", "BETWEEN2", "Within21", "Within22")
  ),
  Subject, First, Second, Within21, Within22, BETWEEN1, BETWEEN2,
  value.x, value.y
)
I separated the reshaping into two pieces, one for A, B, C, D and one for K, L, M, N, and then joined the data together.