I have sample data that looks like this:
id <- c("1a","2c","3d","4f","5g","6e","7f","8q","9r","10v","11x","12l")
O <- c(1,1,0,1,1,0,0,1,0,1,0,1)
dg1 <- c("A02","A84","B12","C94","D37","D12","D68","E12","F48","H12","Z83","")
dg2 <- c("B18","N34","A02","M01","B12","J02","K52","","I10","","","B18")
df <- cbind.data.frame(id,O,dg1,dg2)
I am trying to get a data frame that looks like this so that I can do a univariate logistic regression on O against each variable.
A02 <- c(1,0,1,0,0,0,0,0,0,0,0,0)
A84 <- c(0,1,0,0,0,0,0,0,0,0,0,0)
B12 <- c(0,0,1,0,1,0,0,0,0,0,0,0)
B18 <- c(1,0,0,0,0,0,0,0,0,0,0,1)
C94 <- c(0,0,0,1,0,0,0,0,0,0,0,0)
D12 <- c(0,0,0,0,0,1,0,0,0,0,0,0)
D37 <- c(0,0,0,0,1,0,0,0,0,0,0,0)
D68 <- c(0,0,0,0,0,0,1,0,0,0,0,0)
E12 <- c(0,0,0,0,0,0,0,1,0,0,0,0)
F48 <- c(0,0,0,0,0,0,0,0,1,0,0,0)
H12 <- c(0,0,0,0,0,0,0,0,0,1,0,0)
I10 <- c(0,0,0,0,0,0,0,0,1,0,0,0)
J02 <- c(0,0,0,0,0,1,0,0,0,0,0,0)
K52 <- c(0,0,0,0,0,0,1,0,0,0,0,0)
M01 <- c(0,0,0,1,0,0,0,0,0,0,0,0)
N34 <- c(0,1,0,0,0,0,0,0,0,0,0,0)
Z83 <- c(0,0,0,0,0,0,0,0,0,0,1,0)
df <- cbind.data.frame(df,A02,A84,B12,B18,C94,D12,D37,D68,E12,F48,H12,I10,J02,K52,M01,N34,Z83)
I've attempted to follow the code here and here but ran into issues that I wasn't sure how to fix. Can anyone point out my mistake/misunderstanding? I would prefer to have a solution in dplyr or base, but really willing to try anything.
Attempts:
dumbo <- model.matrix(id ~ dg1 dg2,df)
library(recipes)
dumber <- df %>% recipe(id ~ .) %>%
step_dummy(dg1:dg2,
one_hot = TRUE) %>%
prep() %>% bake(new_data=NULL)
CodePudding user response:
library(tidyverse)
df %>%
left_join(
df %>%
pivot_longer(c(dg1, dg2)) %>%
filter(value != "") %>%
pivot_wider(c(id, O), names_from = value) %>%
mutate(across(c(A02:Z83), ~if_else(is.na(.x), 0, 1)))
)
Joining, by = c("id", "O")
id O dg1 dg2 A02 B18 A84 N34 B12 C94 M01 D37 D12 J02 D68 K52 E12 F48 I10 H12 Z83
1 1a 1 A02 B18 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 2c 1 A84 N34 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
3 3d 0 B12 A02 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
4 4f 1 C94 M01 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
5 5g 1 D37 B12 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
6 6e 0 D12 J02 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0
7 7f 0 D68 K52 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0
8 8q 1 E12 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
9 9r 0 F48 I10 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0
10 10v 1 H12 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
11 11x 0 Z83 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
12 12l 1 B18 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
CodePudding user response:
I have a package {dplyover} on github which helps to solve this kind of problems without data rectangling (pivoting). To make it work we first need to convert the empty cells ""
into NA
s. Then we can use dplyover::dist_values
to get the unique values without NA
s and loop over them to create new columns. We need to do this rowwise
, since the values can be either in dg1
or dg2
.
library(dplyr)
library(dplyover) # https://timteafan.github.io/dplyover/
df %>%
mutate(across(c(dg1, dg2), ~ ifelse(.x == "", NA_character_, .x))) %>%
rowwise %>%
mutate(over(dist_values(c(.$dg1, .$dg2)),
~ifelse(.x %in% c(dg1, dg2), 1, 0)))
#> # A tibble: 12 x 21
#> # Rowwise:
#> id O dg1 dg2 A02 A84 B12 B18 C94 D12 D37 D68 E12
#> <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1a 1 A02 B18 1 0 0 1 0 0 0 0 0
#> 2 2c 1 A84 N34 0 1 0 0 0 0 0 0 0
#> 3 3d 0 B12 A02 1 0 1 0 0 0 0 0 0
#> 4 4f 1 C94 M01 0 0 0 0 1 0 0 0 0
#> 5 5g 1 D37 B12 0 0 1 0 0 0 1 0 0
#> 6 6e 0 D12 J02 0 0 0 0 0 1 0 0 0
#> 7 7f 0 D68 K52 0 0 0 0 0 0 0 1 0
#> 8 8q 1 E12 <NA> 0 0 0 0 0 0 0 0 1
#> 9 9r 0 F48 I10 0 0 0 0 0 0 0 0 0
#> 10 10v 1 H12 <NA> 0 0 0 0 0 0 0 0 0
#> 11 11x 0 Z83 <NA> 0 0 0 0 0 0 0 0 0
#> 12 12l 1 <NA> B18 0 0 0 1 0 0 0 0 0
#> # … with 8 more variables: F48 <dbl>, H12 <dbl>, I10 <dbl>, J02 <dbl>,
#> # K52 <dbl>, M01 <dbl>, N34 <dbl>, Z83 <dbl>
Created on 2022-03-24 by the reprex package (v0.3.0)