dplyr to detect groups with identical composition-CodePudding

Reprex:

library(dplyr)
library(tibble)

test <- tribble(~id_group, ~id_member, ~txt_member, ~id_component,
                      'A',          1,        'aa',            40,
                      'A',          2,        'ff',            30,
                      'A',          3,        'cc',            30,
                      'B',          1,        'dd',            35, 
                      'B',          2,        'gg',            35,
                      'B',          3,        'aa',            30,
                      'C',          1,        'aa',            40,
                      'C',          2,        'cc',            30,
                      'C',          3,        'ff',            30,
                      'D',          1,        'dd',            40, 
                      'D',          2,        'gg',            40,
                      'D',          3,        'aa',            20,
                    )

So, groups A and C in the data frame above are actually 'the same' in that they comprise 40% aa, 30% cc, 30%% ff. I've been detecting issues like this with string concatenation:

test %>% 
  dplyr::select(-id_member) %>% 
  group_by(id_group) %>% 
  dplyr::arrange(txt_member, id_component) %>% 
  summarise(signal = toString(sort(c(txt_member, id_component)))) %>% 
  ungroup() %>% 
  group_by(signal) %>% 
  summarise(duplicates = toString(id_group))

Results:

# A tibble: 3 × 2
  signal                 duplicates
  <chr>                  <chr>     
1 20, 40, 40, aa, dd, gg D         
2 30, 30, 40, aa, cc, ff A, C      
3 30, 35, 35, aa, dd, gg B

but this seems...suboptimal. Is there a better way within tidyverse?

CodePudding user response：

Here is a solution faster by orders of magnitude but not within the tidyverse. It uses split and combn to pair groups of id_group and to determine which are equal in the sense of the question. Unlike the question's pipe, it returns a list with only the duplicate groups.

I have added another test case with two duplicate groups. Date set test2 has an extra group copy&pasted from group D then changed to E.

There is also a larger test case to see if the combn solution scales up. As expected, the difference to the tidyverse solution is smaller but split/combn is still 20x faster.

library(tibble)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

test <- tribble(~id_group, ~id_member, ~txt_member, ~id_component,
                'A',          1,        'aa',            40,
                'A',          2,        'ff',            30,
                'A',          3,        'cc',            30,
                'B',          1,        'dd',            35, 
                'B',          2,        'gg',            35,
                'B',          3,        'aa',            30,
                'C',          1,        'aa',            40,
                'C',          2,        'cc',            30,
                'C',          3,        'ff',            30,
                'D',          1,        'dd',            40, 
                'D',          2,        'gg',            40,
                'D',          3,        'aa',            20,
)
test2 <- rbind(
  test,
  tribble(
    ~id_group, ~id_member, ~txt_member, ~id_component,
    'E',          1,        'dd',            40, 
    'E',          2,        'gg',            40,
    'E',          3,        'aa',            20,
  )
)


dup_signal <- function(x) {
  sp <- split(x, x$id_group)
  inx <- combn(sp, 2, FUN = \(y) {
    nrow(y[[1]]) == nrow(y[[2]]) &&
      all(y[[1]]$txt_member %in% y[[2]]$txt_member) &&
      all(y[[1]]$id_component %in% y[[2]]$id_component)
  })
  jnx <- combn(seq_along(sp), 2, simplify = FALSE)[inx]
  dups <- lapply(jnx, \(j) names(sp)[j])
  dups
}

f <- function(X) {
  X %>% 
    dplyr::select(-id_member) %>% 
    group_by(id_group) %>% 
    dplyr::arrange(txt_member, id_component) %>% 
    summarise(signal = toString(sort(c(txt_member, id_component)))) %>% 
    ungroup() %>% 
    group_by(signal) %>% 
    summarise(duplicates = toString(id_group))
}

dup_signal(test)
#> [[1]]
#> [1] "A" "C"
dup_signal(test2)
#> [[1]]
#> [1] "A" "C"
#> 
#> [[2]]
#> [1] "D" "E"

f(test)
#> # A tibble: 3 × 2
#>   signal                 duplicates
#>   <chr>                  <chr>     
#> 1 20, 40, 40, aa, dd, gg D         
#> 2 30, 30, 40, aa, cc, ff A, C      
#> 3 30, 35, 35, aa, dd, gg B
f(test2)
#> # A tibble: 3 × 2
#>   signal                 duplicates
#>   <chr>                  <chr>     
#> 1 20, 40, 40, aa, dd, gg D, E      
#> 2 30, 30, 40, aa, cc, ff A, C      
#> 3 30, 35, 35, aa, dd, gg B

library(microbenchmark)

mb <- microbenchmark(
  obrl_soil = f(test),
  rui = dup_signal(test)
)
print(mb, order = "median")
#> Unit: microseconds
#>       expr       min         lq       mean     median       uq       max neval
#>        rui   307.801   385.2015   522.5801   444.0505   538.05  3962.201   100
#>  obrl_soil 10078.301 12020.7510 13761.0789 13529.7010 15193.45 21264.801   100
#>  cld
#>   a 
#>    b

test3 <- test
for(i in 1:10) test3 <- rbind(test3, test3)
dim(test3)
#> [1] 12288     4
mb <- microbenchmark(
  obrl_soil = f(test3),
  rui = dup_signal(test3)
)
print(mb, order = "median")
#> Unit: milliseconds
#>       expr     min        lq      mean    median        uq       max neval cld
#>        rui  1.2166  1.795901  2.378632  1.973001  2.269551  9.004801   100  a 
#>  obrl_soil 38.7548 41.701101 44.474624 43.976451 46.462601 55.697001   100   b

^{Created on 2022-07-12 by the reprex package (v2.0.1)}

CodePudding user response：

An alternative tidyverse approach could be:

(ungroup at the end if you need to.)

library(tidyverse)

# Sample data
test <- tribble(
  ~id_group, ~id_member, ~txt_member, ~id_component,
  "A", 1, "aa", 40,
  "A", 2, "ff", 30,
  "A", 3, "cc", 30,
  "B", 1, "dd", 35,
  "B", 2, "gg", 35,
  "B", 3, "aa", 30,
  "C", 1, "aa", 40,
  "C", 2, "cc", 30,
  "C", 3, "ff", 30,
  "D", 1, "dd", 40,
  "D", 2, "gg", 40,
  "D", 3, "aa", 20,
)

# Code
test |> 
  select(-id_member) |> 
  arrange(id_group, txt_member) |> 
  group_by(id_group) |> 
  summarise(across(everything(), ~ str_c(., collapse = "|"))) |> 
  group_by(txt_member, id_component) |> 
  summarise(groups = str_c(id_group, collapse = "|"))

#> # A tibble: 3 × 3
#> # Groups:   txt_member [2]
#>   txt_member id_component groups
#>   <chr>      <chr>        <chr> 
#> 1 aa|cc|ff   40|30|30     A|C   
#> 2 aa|dd|gg   20|40|40     D     
#> 3 aa|dd|gg   30|35|35     B

^{Created on 2022-07-12 by the reprex package (v2.0.1)}