Let's say I have a data frame in R that looks like this:
library(tibble)

# Example data: eight variables (A-F, H, I) with three observations each.
var2 <- rep(c("A", "B", "C", "D", "E", "F", "H", "I"), each = 3)
y2 <- c(
  -1.23, -0.983, 1.28, -0.268, -0.46, -1.23,
  1.87, 0.416, -1.99, 0.289, 1.7, -0.455,
  -0.648, 0.376, -0.887, 0.534, -0.679, -0.923,
  0.987, 0.324, -0.783, -0.679, 0.326, 0.998
)
# Fail early if the hand-typed values do not line up with the labels.
stopifnot(length(y2) == length(var2))
# Group id for every row: six rows each for groups 1, 2, 3 and then 1 again,
# so group 1 ends up containing variables A, B, H and I.
group2 <- rep(c(1, 2, 3, 1), each = 6)
data2 <- tibble(var2, group2, y2)
with output:
# A tibble: 24 × 3
var2 group2 y2
<chr> <dbl> <dbl>
1 A 1 -1.23
2 A 1 -0.983
3 A 1 1.28
4 B 1 -0.268
5 B 1 -0.46
6 B 1 -1.23
7 C 2 1.87
8 C 2 0.416
9 C 2 -1.99
10 D 2 0.289
11 D 2 1.7
12 D 2 -0.455
13 E 3 -0.648
14 E 3 0.376
15 E 3 -0.887
16 F 3 0.534
17 F 3 -0.679
18 F 3 -0.923
19 H 1 0.987
20 H 1 0.324
21 H 1 -0.783
22 I 1 -0.679
23 I 1 0.326
24 I 1 0.998
I want to calculate the correlation of each distinct pair of variables within each group using dplyr. Ideally, the resulting tibble should look like this (with the 4th column containing the correlation value of each pair):
group | var1 | var2 | value |
---|---|---|---|
1 | A | B | cor(A,B) |
1 | A | H | cor(A,H) |
1 | A | I | cor(A,I) |
1 | B | H | cor(B,H) |
1 | B | I | cor(B,I) |
1 | H | I | cor(H,I) |
2 | C | D | cor(C,D) |
3 | E | F | cor(E,F) |
How can I do that in R? Any help would be appreciated.
CodePudding user response:
If you are okay with calling `combn()` twice (once for the pair labels and once for the correlations), you can do:
# Pairwise correlations between the sub-vectors of `x` defined by `y`.
#
# x: numeric vector of observations.
# y: grouping labels, same length as `x`; each distinct label is one variable.
#
# Returns a data frame with one row per unordered pair of labels:
#   vars - the two labels joined by "_" (e.g. "A_B")
#   cor  - the correlation between the two sub-vectors
# When `y` has fewer than two distinct labels there is no pair to correlate,
# so a zero-row data frame with the same columns is returned.
fun <- function(x, y) {
  by_var <- split(x, y)
  # combn() stops with "n < m" when asked for pairs from fewer than 2 items.
  if (length(by_var) < 2L) {
    return(data.frame(vars = character(0), cor = numeric(0)))
  }
  pair_labels <- combn(names(by_var), 2, paste, collapse = "_")
  # unname() so the two vectors reach cor() positionally (as x and y) rather
  # than as arguments named after the labels.
  pair_cors <- combn(unname(by_var), 2, do.call, what = "cor")
  data.frame(vars = pair_labels, cor = pair_cors)
}
library(dplyr)

# Apply fun() within each group2 group. reframe() is used rather than
# summarise() because fun() returns one row per variable pair, and
# summarise() is deprecated for results with more than one row per group
# (dplyr >= 1.1.0). reframe() always returns an ungrouped result.
data2 %>%
  group_by(group2) %>%
  reframe(fun(y2, var2))
# A tibble: 8 x 3
# Groups: group2 [3]
group2 vars cor
<dbl> <chr> <dbl>
1 1 A_B -0.995
2 1 A_H -0.958
3 1 A_I 0.853
4 1 B_H 0.982
5 1 B_I -0.901
6 1 H_I -0.967
7 2 C_D 0.469
8 3 E_F -0.186
If you do not want to repeat the `combn()` call, as the process might be expensive, you can do:
library(dplyr)
library(tidyr)

# Single pass over the variable pairs of each group2 group: combn() hands
# every pair (a named list of two y2 vectors) to the lambda, which returns a
# one-row data frame from stack() with columns `values` (the correlation) and
# `ind` (the "A_B"-style label). reframe() replaces summarise() because a
# group can yield several rows, which summarise() deprecates (dplyr >= 1.1.0).
data2 %>%
  group_by(group2) %>%
  reframe(
    s = combn(
      split(y2, var2), 2,
      \(pair) stack(setNames(
        cor(pair[[1]], pair[[2]]),
        paste(names(pair), collapse = "_")
      )),
      simplify = FALSE
    )
  ) %>%
  # s is a list column of one-row data frames; expand it into real columns.
  unnest(s)
# A tibble: 8 x 3
group2 values ind
<dbl> <dbl> <fct>
1 1 -0.995 A_B
2 1 -0.958 A_H
3 1 0.853 A_I
4 1 0.982 B_H
5 1 -0.901 B_I
6 1 -0.967 H_I
7 2 0.469 C_D
8 3 -0.186 E_F
CodePudding user response:
A possible solution:
library(tidyverse)

# For every group2 group: build one column per variable, compute the
# correlation matrix, and keep its upper triangle as (row, col, corr) rows.
# The group label is read from the data itself; labelling by list position
# would only be right when the group ids happen to be 1, 2, 3, ...
data2 %>%
  group_by(group2) %>%
  group_split() %>%
  map(\(grp) {
    cors <- grp %>%
      group_by(var2) %>%
      # Assumes the remaining columns are group2 then y2 (TODO confirm):
      # drop the first and name the y2 column after the variable key.
      group_map(~ data.frame(.x[-1]) %>% set_names(.y)) %>%
      bind_cols() %>%
      cor()
    upper <- upper.tri(cors)
    data.frame(
      # rep() keeps this valid when a group has a single variable and the
      # upper triangle is therefore empty.
      group = rep(grp$group2[1], sum(upper)),
      row = rownames(cors)[row(cors)[upper]],
      col = colnames(cors)[col(cors)[upper]],
      corr = cors[upper]
    )
  }) %>%
  bind_rows()
#> group row col corr
#> 1 1 A B -0.9949738
#> 2 1 A H -0.9581357
#> 3 1 B H 0.9819901
#> 4 1 A I 0.8533855
#> 5 1 B I -0.9012948
#> 6 1 H I -0.9669093
#> 7 2 C D 0.4690460
#> 8 3 E F -0.1864518
CodePudding user response:
Another option would be `widyr::pairwise_cor()`,
which requires first adding an identifier for each "observation":
library(widyr)
library(dplyr)

# Number the observations within each (var2, group2) combination, then run
# pairwise_cor() separately on each group2 subset with upper = FALSE.
# bind_rows(.id = "group2") stacks the per-group results and stores the list
# names (the group ids, as character) in a `group2` column.
data2 %>%
  group_by(var2, group2) %>%
  mutate(obs = row_number()) %>%
  ungroup() %>%
  split(f = .[["group2"]]) %>%
  lapply(\(grp) widyr::pairwise_cor(grp, var2, obs, y2, upper = FALSE)) %>%
  bind_rows(.id = "group2")
#> # A tibble: 8 × 4
#> group2 item1 item2 correlation
#> <chr> <chr> <chr> <dbl>
#> 1 1 A B -0.995
#> 2 1 A H -0.958
#> 3 1 B H 0.982
#> 4 1 A I 0.853
#> 5 1 B I -0.901
#> 6 1 H I -0.967
#> 7 2 C D 0.469
#> 8 3 E F -0.186