Apply a function to each group-CodePudding

I have this dataset:

A<- c(10,20,10,31,51,1,60,1,02,0,12,0,20,1,0,0,0,0,1,0,1,1,1)
B<- c(1,0,0,1,1,1,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0)
C<- c(1,0,0,1,1,1,0,1,1,0,1,1,0,0,0,1,0,0,0,0,0,0,1)
SUB <- c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2)
dat <- as.data.frame(cbind(SUB,B,A,C))

I wrote a function calculating the cor among A/B, B/C, C/A.

Z <- function(a,b,c) {
  cor1 = cor(a,b)
  cor2 = cor(b,c)
  cor3 = cor(c,a)
  
  x = c(cor1,cor2,cor3)
  
  return(x)
}

if I type

Z(dat$A, dat$B,dat$C)

I get the vector of results:

> [1] 0.11294312 0.91417410 0.06457059

I need to condition my function to the SUB variable and get a matrix whose rows are the cor among A/B, B/C, C/A for each SUB.

For instance:

        A/B       B/C        C/A
SUB1 0.11294312 0.91417410 0.06457059
SUB2 0.10335312 0.96744677 0.16356059

Thank you, Best regards

CodePudding user response：

base R

You can split with by and then recombine.

do.call(rbind, by(dat, dat$SUB, function(x) Z(x$A, x$B, x$C)))
#         [,1]      [,2]        [,3]
# 1 -0.1534126 1.0000000 -0.15341258
# 2  0.1081781 0.8215838  0.04608456

The row names 1 and 2 are the SUB values themselves; if SUB is more "interesting" than counting numbers, it will be more apparent. Column names can be applied trivially.

dplyr

library(dplyr)
dat %>%
  group_by(SUB) %>%
  summarize(as.data.frame(matrix(Z(A, B, C), nr = 1)))
# # A tibble: 2 x 4
#     SUB     V1    V2      V3
#   <dbl>  <dbl> <dbl>   <dbl>
# 1     1 -0.153 1.00  -0.153 
# 2     2  0.108 0.822  0.0461

CodePudding user response：

Try split in combination with sapply

sapply( split(dat,dat$SUB), function(x) Z(x["A"],x["B"],x["C"]) )
              1          2
[1,] -0.1534126 0.10817808
[2,]  1.0000000 0.82158384
[3,] -0.1534126 0.04608456

CodePudding user response：

Actually there's no need for your function if you use the upper.tri of the correlation matrix. Recently you can do this very easily by piping:

sapply(unique(dat$SUB), \(i) cor(dat[dat$SUB == i, -1]) |> {\(x) x[upper.tri(x)]}())
#             [,1]       [,2]
# [1,] -0.1534126 0.10817808
# [2,]  1.0000000 0.82158384
# [3,] -0.1534126 0.04608456

R.version.string
# [1] "R version 4.1.2 (2021-11-01)"

Data

dat <- structure(list(SUB = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2), B = c(1, 0, 0, 1, 1, 1, 0, 1, 
1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), A = c(10, 20, 10, 
31, 51, 1, 60, 1, 2, 0, 12, 0, 20, 1, 0, 0, 0, 0, 1, 0, 1, 1, 
1), C = c(1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 
0, 0, 0, 0, 0, 1)), class = "data.frame", row.names = c(NA, -23L
))

CodePudding user response：

This is a lengthy answer, but it should be pretty flexible.

library(tidyverse)

cor.by.group.combos <- function(.data, groups, vars){
  by <-  gsub(x = rlang::quo_get_expr(enquo(groups)), pattern = "\\((.*)?\\)", replacement = "\\1")[-1]
  
  piv <- gsub(x = rlang::quo_get_expr(enquo(vars)), pattern = "\\((.*)?\\)", replacement = "\\1")[-1]
  
  .data %>%
    group_by(!!!groups) %>%
    group_split() %>%
    map(.,
      ~pivot_longer(., cols = all_of(piv), names_to = "name", values_to = "val") %>%
        nest(data = val) %>%
        full_join(.,.,by = by) %>%
        filter(name.x != name.y) %>%
        mutate(test = paste(name.x, "vs",name.y, sep = "."),
               grp = paste0(by,!!!groups),
               cor = map2_dbl(data.x,data.y, ~cor(unlist(.x), unlist(.y)))) %>%
        select(test,grp, cor)
    ) %>%
  bind_rows() %>%
    pivot_wider(names_from = test, values_from = cor)
}

cor.by.group.combos(dat, vars(SUB), vars(A, B, C))
#> # A tibble: 2 x 7
#>   grp   A.vs.B  A.vs.C B.vs.A B.vs.C  C.vs.A C.vs.B
#>   <chr>  <dbl>   <dbl>  <dbl>  <dbl>   <dbl>  <dbl>
#> 1 SUB1  -0.153 -0.153  -0.153  1     -0.153   1    
#> 2 SUB2   0.108  0.0461  0.108  0.822  0.0461  0.822

In essence, what we are doing is splitting the data by group, and then applying a cor test to every combination of the selected variables. The way I set this up will give some duplicate tests (e.g., A.vs.B and B.vs.A). You could fix this by using combn instead of full_join, but I didn't take the time to work out the details. This function should work if you change the input variables, the grouping variables, ect. You can also apply multiple groups with this method.