I have a dataframe with several variables (23 in my example) with binary yes/no conditions, and I am trying to identify combinations of pairs of variables
df <- tibble(V1 = sample(c(0,1), 25, replace=TRUE, prob=c(0.6, 0.4)),
V2 = sample(c(0,1), 25, replace=TRUE, prob=c(0.6, 0.4)),
V3 = sample(c(0,1), 25, replace=TRUE, prob=c(0.8, 0.2)),
V4 = sample(c(0,1), 25, replace=TRUE, prob=c(0.7, 0.3)),
V5 = sample(c(0,1), 25, replace=TRUE, prob=c(0.8, 0.2)),
V6 = sample(c(0,1), 25, replace=TRUE, prob=c(0.8, 0.2)),
V7 = sample(c(0,1), 25, replace=TRUE, prob=c(0.8, 0.2)))
If I wanted to identify every unique group in my dataframe I would use cur_group_id() like this:
df %>% group_by(across(everything())) %>%
mutate(combo_id = cur_group_id())
But what I actually want is to identify combination of pairs of yes conditions. For example, I want to identify cases where V1 == 1 & V2 == 1, ignoring what any of the other columns contain.
So basically I want to do this:
df %>%
mutate(combo_id = case_when(V1 == 1 & V2 == 1 ~ "V1_V2"))
but I want to be able to apply this across every possible 2 variable combination for all of the variables in my dataframe.
Maybe this is a job for map()? I'm stuck.
CodePudding user response:
We may use combn
out <- combn(names(df), 2, FUN = \(x)
case_when(rowSums(df[x]) == 2 ~ paste(x, collapse = "_")))
colnames(out) <- paste0("combo_id_", combn(names(df), 2,
FUN = paste, collapse = "_"))
df2 <- cbind(df, out)
-output
> head(df2, 2)
V1 V2 V3 V4 V5 V6 V7 combo_id_V1_V2 combo_id_V1_V3 combo_id_V1_V4 combo_id_V1_V5 combo_id_V1_V6 combo_id_V1_V7 combo_id_V2_V3
1 1 0 0 1 1 0 1 <NA> <NA> V1_V4 V1_V5 <NA> V1_V7 <NA>
2 1 0 0 1 0 0 0 <NA> <NA> V1_V4 <NA> <NA> <NA> <NA>
combo_id_V2_V4 combo_id_V2_V5 combo_id_V2_V6 combo_id_V2_V7 combo_id_V3_V4 combo_id_V3_V5 combo_id_V3_V6 combo_id_V3_V7
1 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
2 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
combo_id_V4_V5 combo_id_V4_V6 combo_id_V4_V7 combo_id_V5_V6 combo_id_V5_V7 combo_id_V6_V7
1 V4_V5 <NA> V4_V7 <NA> V5_V7 <NA>
2 <NA> <NA> <NA> <NA> <NA> <NA>
Or using tidyverse
library(dplyr)
library(purrr)
library(stringr)
combn(df, 2, simplify = FALSE) %>%
map_dfc(~ .x %>%
transmute(!! sprintf("combo_id_%s", str_c(names(.),
collapse = "_")) := case_when(rowSums(across(everything())) == 2
~ str_c(names(.), collapse = "_")))) %>%
bind_cols(df, .)
CodePudding user response:
Here's an approach where I make the data long, join to itself for matches between columns, and reshape wide.
library(dplyr); library(tidyr)
df_r <- df %>%
mutate(row = row_number()) %>%
pivot_longer(-row) %>%
filter(value == 1)
df_r %>%
left_join(df_r, by = "row") %>%
filter(name.x != name.y) %>%
transmute(row, combo = paste(name.x, name.y, sep = "_"), value = 1) %>%
complete(row, combo, fill = list(value = 0)) %>%
arrange(row, combo) %>%
pivot_wider(names_from = combo, values_from = value)
Result
# A tibble: 16 × 37
row V1_V2 V1_V3 V1_V4 V1_V5 V1_V6 V1_V7 V2_V1 V2_V3 V2_V4 V2_V5 V2_V6 V2_V7 V3_V1 V3_V2 V3_V4 V3_V5
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0
2 2 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 4 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
5 6 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
6 7 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0
7 8 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
8 9 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0
9 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
10 12 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
11 13 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
12 15 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0
13 16 1 0 1 0 1 0 1 0 1 0 1 0 0 0 0 0
14 19 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1
15 20 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0
16 21 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
# … with 20 more variables: V3_V6 <dbl>, V3_V7 <dbl>, V4_V1 <dbl>, V4_V2 <dbl>, V4_V3 <dbl>, V4_V5 <dbl>,
# V4_V6 <dbl>, V5_V1 <dbl>, V5_V2 <dbl>, V5_V3 <dbl>, V5_V4 <dbl>, V5_V6 <dbl>, V6_V1 <dbl>, V6_V2 <dbl>,
# V6_V3 <dbl>, V6_V4 <dbl>, V6_V5 <dbl>, V7_V1 <dbl>, V7_V2 <dbl>, V7_V3 <dbl>
# ℹ Use `colnames()` to see all variable names