Reprex:
library(dplyr)
library(tibble)
test <- tribble(~id_group, ~id_member, ~txt_member, ~id_component,
'A', 1, 'aa', 40,
'A', 2, 'ff', 30,
'A', 3, 'cc', 30,
'B', 1, 'dd', 35,
'B', 2, 'gg', 35,
'B', 3, 'aa', 30,
'C', 1, 'aa', 40,
'C', 2, 'cc', 30,
'C', 3, 'ff', 30,
'D', 1, 'dd', 40,
'D', 2, 'gg', 40,
'D', 3, 'aa', 20,
)
So, groups A and C in the data frame above are actually 'the same' in that they comprise 40% aa, 30% cc, 30%% ff. I've been detecting issues like this with string concatenation:
test %>%
dplyr::select(-id_member) %>%
group_by(id_group) %>%
dplyr::arrange(txt_member, id_component) %>%
summarise(signal = toString(sort(c(txt_member, id_component)))) %>%
ungroup() %>%
group_by(signal) %>%
summarise(duplicates = toString(id_group))
Results:
# A tibble: 3 × 2
signal duplicates
<chr> <chr>
1 20, 40, 40, aa, dd, gg D
2 30, 30, 40, aa, cc, ff A, C
3 30, 35, 35, aa, dd, gg B
but this seems...suboptimal. Is there a better way within tidyverse?
CodePudding user response:
Here is a solution faster by orders of magnitude but not within the tidyverse. It uses split
and combn
to pair groups of id_group
and to determine which are equal in the sense of the question. Unlike the question's pipe, it returns a list with only the duplicate groups.
I have added another test case with two duplicate groups. Date set test2
has an extra group copy&pasted from group D
then changed to E
.
There is also a larger test case to see if the combn
solution scales up. As expected, the difference to the tidyverse solution is smaller but split/combn
is still 20x faster.
library(tibble)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
test <- tribble(~id_group, ~id_member, ~txt_member, ~id_component,
'A', 1, 'aa', 40,
'A', 2, 'ff', 30,
'A', 3, 'cc', 30,
'B', 1, 'dd', 35,
'B', 2, 'gg', 35,
'B', 3, 'aa', 30,
'C', 1, 'aa', 40,
'C', 2, 'cc', 30,
'C', 3, 'ff', 30,
'D', 1, 'dd', 40,
'D', 2, 'gg', 40,
'D', 3, 'aa', 20,
)
test2 <- rbind(
test,
tribble(
~id_group, ~id_member, ~txt_member, ~id_component,
'E', 1, 'dd', 40,
'E', 2, 'gg', 40,
'E', 3, 'aa', 20,
)
)
dup_signal <- function(x) {
sp <- split(x, x$id_group)
inx <- combn(sp, 2, FUN = \(y) {
nrow(y[[1]]) == nrow(y[[2]]) &&
all(y[[1]]$txt_member %in% y[[2]]$txt_member) &&
all(y[[1]]$id_component %in% y[[2]]$id_component)
})
jnx <- combn(seq_along(sp), 2, simplify = FALSE)[inx]
dups <- lapply(jnx, \(j) names(sp)[j])
dups
}
f <- function(X) {
X %>%
dplyr::select(-id_member) %>%
group_by(id_group) %>%
dplyr::arrange(txt_member, id_component) %>%
summarise(signal = toString(sort(c(txt_member, id_component)))) %>%
ungroup() %>%
group_by(signal) %>%
summarise(duplicates = toString(id_group))
}
dup_signal(test)
#> [[1]]
#> [1] "A" "C"
dup_signal(test2)
#> [[1]]
#> [1] "A" "C"
#>
#> [[2]]
#> [1] "D" "E"
f(test)
#> # A tibble: 3 × 2
#> signal duplicates
#> <chr> <chr>
#> 1 20, 40, 40, aa, dd, gg D
#> 2 30, 30, 40, aa, cc, ff A, C
#> 3 30, 35, 35, aa, dd, gg B
f(test2)
#> # A tibble: 3 × 2
#> signal duplicates
#> <chr> <chr>
#> 1 20, 40, 40, aa, dd, gg D, E
#> 2 30, 30, 40, aa, cc, ff A, C
#> 3 30, 35, 35, aa, dd, gg B
library(microbenchmark)
mb <- microbenchmark(
obrl_soil = f(test),
rui = dup_signal(test)
)
print(mb, order = "median")
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> rui 307.801 385.2015 522.5801 444.0505 538.05 3962.201 100
#> obrl_soil 10078.301 12020.7510 13761.0789 13529.7010 15193.45 21264.801 100
#> cld
#> a
#> b
test3 <- test
for(i in 1:10) test3 <- rbind(test3, test3)
dim(test3)
#> [1] 12288 4
mb <- microbenchmark(
obrl_soil = f(test3),
rui = dup_signal(test3)
)
print(mb, order = "median")
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> rui 1.2166 1.795901 2.378632 1.973001 2.269551 9.004801 100 a
#> obrl_soil 38.7548 41.701101 44.474624 43.976451 46.462601 55.697001 100 b
Created on 2022-07-12 by the reprex package (v2.0.1)
CodePudding user response:
An alternative tidyverse approach could be:
(ungroup
at the end if you need to.)
library(tidyverse)
# Sample data
test <- tribble(
~id_group, ~id_member, ~txt_member, ~id_component,
"A", 1, "aa", 40,
"A", 2, "ff", 30,
"A", 3, "cc", 30,
"B", 1, "dd", 35,
"B", 2, "gg", 35,
"B", 3, "aa", 30,
"C", 1, "aa", 40,
"C", 2, "cc", 30,
"C", 3, "ff", 30,
"D", 1, "dd", 40,
"D", 2, "gg", 40,
"D", 3, "aa", 20,
)
# Code
test |>
select(-id_member) |>
arrange(id_group, txt_member) |>
group_by(id_group) |>
summarise(across(everything(), ~ str_c(., collapse = "|"))) |>
group_by(txt_member, id_component) |>
summarise(groups = str_c(id_group, collapse = "|"))
#> # A tibble: 3 × 3
#> # Groups: txt_member [2]
#> txt_member id_component groups
#> <chr> <chr> <chr>
#> 1 aa|cc|ff 40|30|30 A|C
#> 2 aa|dd|gg 20|40|40 D
#> 3 aa|dd|gg 30|35|35 B
Created on 2022-07-12 by the reprex package (v2.0.1)