Home > Back-end >  Identifying combinations of binary variables in tidyverse
Identifying combinations of binary variables in tidyverse

Time:01-24

I have a dataframe with several variables (23 in my example) with binary yes/no conditions, and I am trying to identify combinations of pairs of variables

df <- tibble(V1 = sample(c(0,1), 25, replace=TRUE, prob=c(0.6, 0.4)),
             V2 = sample(c(0,1), 25, replace=TRUE, prob=c(0.6, 0.4)),
             V3 = sample(c(0,1), 25, replace=TRUE, prob=c(0.8, 0.2)),
             V4 = sample(c(0,1), 25, replace=TRUE, prob=c(0.7, 0.3)),
             V5 = sample(c(0,1), 25, replace=TRUE, prob=c(0.8, 0.2)),
             V6 = sample(c(0,1), 25, replace=TRUE, prob=c(0.8, 0.2)),
             V7 = sample(c(0,1), 25, replace=TRUE, prob=c(0.8, 0.2)))

If I wanted to identify every unique group in my dataframe I would use cur_group_id() like this:

df %>% group_by(across(everything())) %>%
  mutate(combo_id = cur_group_id())

But what I actually want is to identify combination of pairs of yes conditions. For example, I want to identify cases where V1 == 1 & V2 == 1, ignoring what any of the other columns contain.

So basically I want to do this:

df %>% 
  mutate(combo_id = case_when(V1 == 1 & V2 == 1 ~ "V1_V2"))

but I want to be able to apply this across every possible 2 variable combination for all of the variables in my dataframe.

Maybe this is a job for map()? I'm stuck.

CodePudding user response:

We may use combn

out <- combn(names(df), 2, FUN = \(x)
   case_when(rowSums(df[x]) == 2 ~ paste(x, collapse = "_")))

colnames(out) <- paste0("combo_id_", combn(names(df), 2, 
    FUN = paste, collapse = "_"))
df2 <- cbind(df, out)

-output

> head(df2, 2)
  V1 V2 V3 V4 V5 V6 V7 combo_id_V1_V2 combo_id_V1_V3 combo_id_V1_V4 combo_id_V1_V5 combo_id_V1_V6 combo_id_V1_V7 combo_id_V2_V3
1  1  0  0  1  1  0  1           <NA>           <NA>          V1_V4          V1_V5           <NA>          V1_V7           <NA>
2  1  0  0  1  0  0  0           <NA>           <NA>          V1_V4           <NA>           <NA>           <NA>           <NA>
  combo_id_V2_V4 combo_id_V2_V5 combo_id_V2_V6 combo_id_V2_V7 combo_id_V3_V4 combo_id_V3_V5 combo_id_V3_V6 combo_id_V3_V7
1           <NA>           <NA>           <NA>           <NA>           <NA>           <NA>           <NA>           <NA>
2           <NA>           <NA>           <NA>           <NA>           <NA>           <NA>           <NA>           <NA>
  combo_id_V4_V5 combo_id_V4_V6 combo_id_V4_V7 combo_id_V5_V6 combo_id_V5_V7 combo_id_V6_V7
1          V4_V5           <NA>          V4_V7           <NA>          V5_V7           <NA>
2           <NA>           <NA>           <NA>           <NA>           <NA>           <NA>

Or using tidyverse

library(dplyr)
library(purrr)
library(stringr)
combn(df, 2, simplify = FALSE) %>%
  map_dfc(~ .x %>%
   transmute(!! sprintf("combo_id_%s", str_c(names(.),
   collapse = "_")) := case_when(rowSums(across(everything())) == 2 
   ~ str_c(names(.), collapse = "_")))) %>%
  bind_cols(df, .)

CodePudding user response:

Here's an approach where I make the data long, join to itself for matches between columns, and reshape wide.

library(dplyr); library(tidyr)
df_r <- df %>%
  mutate(row = row_number()) %>%
  pivot_longer(-row) %>%
  filter(value == 1)

df_r %>%
  left_join(df_r, by = "row") %>%
  filter(name.x != name.y) %>%
  transmute(row, combo = paste(name.x, name.y, sep = "_"), value = 1) %>%
  complete(row, combo, fill = list(value = 0)) %>%
  arrange(row, combo) %>%
  pivot_wider(names_from = combo, values_from = value)

Result

# A tibble: 16 × 37
     row V1_V2 V1_V3 V1_V4 V1_V5 V1_V6 V1_V7 V2_V1 V2_V3 V2_V4 V2_V5 V2_V6 V2_V7 V3_V1 V3_V2 V3_V4 V3_V5
   <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1     1     0     0     0     0     0     0     0     0     0     1     1     0     0     0     0     0
 2     2     0     0     0     0     0     0     0     0     1     0     0     0     0     0     0     0
 3     3     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
 4     4     0     0     0     0     0     0     0     0     1     0     0     0     0     0     0     0
 5     6     1     0     0     0     0     0     1     0     0     0     0     0     0     0     0     0
 6     7     0     1     0     0     0     1     0     0     0     0     0     0     1     0     0     0
 7     8     1     0     0     0     0     0     1     0     0     0     0     0     0     0     0     0
 8     9     1     0     1     0     0     0     1     0     1     0     0     0     0     0     0     0
 9    11     0     0     0     0     0     0     0     0     0     0     0     0     0     0     1     0
10    12     1     0     0     0     0     0     1     0     0     0     0     0     0     0     0     0
11    13     0     0     0     0     0     0     0     0     0     0     0     1     0     0     0     0
12    15     0     0     0     0     0     0     0     1     0     0     0     1     0     1     0     0
13    16     1     0     1     0     1     0     1     0     1     0     1     0     0     0     0     0
14    19     0     0     0     0     0     0     0     1     1     1     1     0     0     1     1     1
15    20     0     0     0     0     0     0     0     1     0     0     0     0     0     1     0     0
16    21     0     0     0     1     0     0     0     0     0     0     0     0     0     0     0     0
# … with 20 more variables: V3_V6 <dbl>, V3_V7 <dbl>, V4_V1 <dbl>, V4_V2 <dbl>, V4_V3 <dbl>, V4_V5 <dbl>,
#   V4_V6 <dbl>, V5_V1 <dbl>, V5_V2 <dbl>, V5_V3 <dbl>, V5_V4 <dbl>, V5_V6 <dbl>, V6_V1 <dbl>, V6_V2 <dbl>,
#   V6_V3 <dbl>, V6_V4 <dbl>, V6_V5 <dbl>, V7_V1 <dbl>, V7_V2 <dbl>, V7_V3 <dbl>
# ℹ Use `colnames()` to see all variable names
  • Related