Dummy coding syntax (one hot coding question)-CodePudding

I have sample data that looks like this:

id <- c("1a","2c","3d","4f","5g","6e","7f","8q","9r","10v","11x","12l")
O <- c(1,1,0,1,1,0,0,1,0,1,0,1)
dg1 <- c("A02","A84","B12","C94","D37","D12","D68","E12","F48","H12","Z83","")
dg2 <- c("B18","N34","A02","M01","B12","J02","K52","","I10","","","B18")
df <- cbind.data.frame(id,O,dg1,dg2)

I am trying to get a data frame that looks like this so that I can do a univariate logistic regression on O against each variable.

A02 <- c(1,0,1,0,0,0,0,0,0,0,0,0)
A84 <- c(0,1,0,0,0,0,0,0,0,0,0,0)
B12 <- c(0,0,1,0,1,0,0,0,0,0,0,0)
B18 <- c(1,0,0,0,0,0,0,0,0,0,0,1)
C94 <- c(0,0,0,1,0,0,0,0,0,0,0,0)
D12 <- c(0,0,0,0,0,1,0,0,0,0,0,0)
D37 <- c(0,0,0,0,1,0,0,0,0,0,0,0)
D68 <- c(0,0,0,0,0,0,1,0,0,0,0,0)
E12 <- c(0,0,0,0,0,0,0,1,0,0,0,0)
F48 <- c(0,0,0,0,0,0,0,0,1,0,0,0)
H12 <- c(0,0,0,0,0,0,0,0,0,1,0,0)
I10 <- c(0,0,0,0,0,0,0,0,1,0,0,0)
J02 <- c(0,0,0,0,0,1,0,0,0,0,0,0)
K52 <- c(0,0,0,0,0,0,1,0,0,0,0,0)
M01 <- c(0,0,0,1,0,0,0,0,0,0,0,0)
N34 <- c(0,1,0,0,0,0,0,0,0,0,0,0)
Z83 <- c(0,0,0,0,0,0,0,0,0,0,1,0)

df <- cbind.data.frame(df,A02,A84,B12,B18,C94,D12,D37,D68,E12,F48,H12,I10,J02,K52,M01,N34,Z83)

I've attempted to follow the code here and here but ran into issues that I wasn't sure how to fix. Can anyone point out my mistake/misunderstanding? I would prefer to have a solution in dplyr or base, but really willing to try anything.

Attempts:

dumbo <- model.matrix(id ~ dg1 dg2,df)

library(recipes)
dumber <- df %>% recipe(id ~ .) %>%
  step_dummy(dg1:dg2,
             one_hot = TRUE)  %>% 
  prep() %>% bake(new_data=NULL)

CodePudding user response：

library(tidyverse)


df %>% 
  left_join(
  df %>% 
    pivot_longer(c(dg1, dg2)) %>% 
    filter(value != "") %>% 
    pivot_wider(c(id, O), names_from = value) %>% 
    mutate(across(c(A02:Z83), ~if_else(is.na(.x), 0, 1)))
  )

Joining, by = c("id", "O")
    id O dg1 dg2 A02 B18 A84 N34 B12 C94 M01 D37 D12 J02 D68 K52 E12 F48 I10 H12 Z83
1   1a 1 A02 B18   1   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
2   2c 1 A84 N34   0   0   1   1   0   0   0   0   0   0   0   0   0   0   0   0   0
3   3d 0 B12 A02   1   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0
4   4f 1 C94 M01   0   0   0   0   0   1   1   0   0   0   0   0   0   0   0   0   0
5   5g 1 D37 B12   0   0   0   0   1   0   0   1   0   0   0   0   0   0   0   0   0
6   6e 0 D12 J02   0   0   0   0   0   0   0   0   1   1   0   0   0   0   0   0   0
7   7f 0 D68 K52   0   0   0   0   0   0   0   0   0   0   1   1   0   0   0   0   0
8   8q 1 E12       0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
9   9r 0 F48 I10   0   0   0   0   0   0   0   0   0   0   0   0   0   1   1   0   0
10 10v 1 H12       0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
11 11x 0 Z83       0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1
12 12l 1     B18   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0

CodePudding user response：

I have a package {dplyover} on github which helps to solve this kind of problems without data rectangling (pivoting). To make it work we first need to convert the empty cells "" into NAs. Then we can use dplyover::dist_values to get the unique values without NAs and loop over them to create new columns. We need to do this rowwise, since the values can be either in dg1 or dg2.

library(dplyr)
library(dplyover) # https://timteafan.github.io/dplyover/

df %>% 
  mutate(across(c(dg1, dg2), ~ ifelse(.x == "", NA_character_, .x))) %>% 
  rowwise %>% 
  mutate(over(dist_values(c(.$dg1, .$dg2)),
              ~ifelse(.x %in% c(dg1, dg2), 1, 0)))

#> # A tibble: 12 x 21
#> # Rowwise: 
#>    id        O dg1   dg2     A02   A84   B12   B18   C94   D12   D37   D68   E12
#>    <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1 1a        1 A02   B18       1     0     0     1     0     0     0     0     0
#>  2 2c        1 A84   N34       0     1     0     0     0     0     0     0     0
#>  3 3d        0 B12   A02       1     0     1     0     0     0     0     0     0
#>  4 4f        1 C94   M01       0     0     0     0     1     0     0     0     0
#>  5 5g        1 D37   B12       0     0     1     0     0     0     1     0     0
#>  6 6e        0 D12   J02       0     0     0     0     0     1     0     0     0
#>  7 7f        0 D68   K52       0     0     0     0     0     0     0     1     0
#>  8 8q        1 E12   <NA>      0     0     0     0     0     0     0     0     1
#>  9 9r        0 F48   I10       0     0     0     0     0     0     0     0     0
#> 10 10v       1 H12   <NA>      0     0     0     0     0     0     0     0     0
#> 11 11x       0 Z83   <NA>      0     0     0     0     0     0     0     0     0
#> 12 12l       1 <NA>  B18       0     0     0     1     0     0     0     0     0
#> # … with 8 more variables: F48 <dbl>, H12 <dbl>, I10 <dbl>, J02 <dbl>,
#> #   K52 <dbl>, M01 <dbl>, N34 <dbl>, Z83 <dbl>

^{Created on 2022-03-24 by the reprex package (v0.3.0)}