Create multiple dummy variables based on column in R-CodePudding

I have a data frame as shown below, and I want to create a dummy column for each unique string that appears in the genres column.

       tconst                   genres
 1: tt0000001        Documentary,Short
 2: tt0000002          Animation,Short
 3: tt0000003 Animation,Comedy,Romance
 4: tt0000004          Animation,Short
 5: tt0000005             Comedy,Short
 6: tt0000006                    Short
 7: tt0000007              Short,Sport
 8: tt0000008        Documentary,Short
 9: tt0000009            Romance,Short
10: tt0000010        Documentary,Short
11: tt0000011        Documentary,Short
12: tt0000012        Documentary,Short
13: tt0000013        Documentary,Short
14: tt0000014             Comedy,Short
15: tt0000015          Animation,Short

I have tried the code below; apart from being inefficient, it returns incorrect output.

# Collect the distinct genre names: split every `genres` value into one row
# per genre, extract a column as a vector, and de-duplicate it.
uniqueGenre <- MovieRating_test %>% 
  separate_rows(genres) %>% 
  # pull() with no argument extracts the last column, which is `genres` here.
  pull() %>%
  unique()

# For every row i and every genre j, write 1/0 into a column named after the
# genre. This visits each cell individually, so it scales with rows x genres.
for(i in 1:nrow(MovieRating_test)){
  for(j in uniqueGenre){
    # NOTE(review): strsplit() returns a list (one element per input string),
    # so `%in%` compares j against that list element as a whole instead of
    # against the individual genres. The test therefore appears to succeed
    # only for rows holding a single genre, which is the likely cause of the
    # incorrect output. Extracting the vector with strsplit(...)[[1]] would
    # make `%in%` look at each genre.
    MovieRating_test[i,j] <- ifelse(j %in% strsplit(as.character(MovieRating_test[,"genres"][i]),","), 1, 0)
  }
}

dataset

# Reproducible copy of the example data: 15 titles (`tconst`) with their
# comma-separated `genres`.
# The `.internal.selfref = <pointer: ...>` argument that dput() prints for a
# data.table has been removed: it is not valid R syntax, so the snippet could
# not be parsed. After loading data.table, call setDT(MovieRating_test) to
# restore the internal pointer before adding columns with `:=`.
MovieRating_test <- structure(
  list(
    tconst = c(
      "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt0000005",
      "tt0000006", "tt0000007", "tt0000008", "tt0000009", "tt0000010",
      "tt0000011", "tt0000012", "tt0000013", "tt0000014", "tt0000015"
    ),
    genres = c(
      "Documentary,Short", "Animation,Short", "Animation,Comedy,Romance",
      "Animation,Short", "Comedy,Short", "Short", "Short,Sport",
      "Documentary,Short", "Romance,Short", "Documentary,Short",
      "Documentary,Short", "Documentary,Short", "Documentary,Short",
      "Comedy,Short", "Animation,Short"
    )
  ),
  row.names = c(NA, -15L),
  class = c("data.table", "data.frame")
)

A data.table solution would be preferred, but any solution is welcome.

CodePudding user response：

We may use dummy_cols from fastDummies

library(fastDummies)
# Build one 0/1 indicator column per genre; `split = ","` breaks each
# comma-separated `genres` value apart before the dummies are created.
dummy_cols(MovieRating_test, select_columns = "genres", split = ",")

-output

       tconst                   genres genres_Animation genres_Comedy genres_Romance genres_Short genres_Documentary
       <char>                   <char>            <int>         <int>          <int>        <int>              <int>
 1: tt0000001        Documentary,Short                0             0              0            1                  1
 2: tt0000002          Animation,Short                1             0              0            1                  0
 3: tt0000003 Animation,Comedy,Romance                1             1              1            0                  0
 4: tt0000004          Animation,Short                1             0              0            1                  0
 5: tt0000005             Comedy,Short                0             1              0            1                  0
 6: tt0000006                    Short                0             0              0            1                  0
 7: tt0000007              Short,Sport                0             0              0            1                  0
 8: tt0000008        Documentary,Short                0             0              0            1                  1
 9: tt0000009            Romance,Short                0             0              1            1                  0
10: tt0000010        Documentary,Short                0             0              0            1                  1
11: tt0000011        Documentary,Short                0             0              0            1                  1
12: tt0000012        Documentary,Short                0             0              0            1                  1
13: tt0000013        Documentary,Short                0             0              0            1                  1
14: tt0000014             Comedy,Short                0             1              0            1                  0
15: tt0000015          Animation,Short                1             0              0            1                  0
    genres_Sport
           <int>
 1:            0
 2:            0
 3:            0
 4:            0
 5:            0
 6:            0
 7:            1
 8:            0
 9:            0
10:            0
11:            0
12:            0
13:            0
14:            0
15:            0

Or another option with mtabulate

library(data.table)
library(qdapTools)
# Split each `genres` string on commas and tabulate the pieces per row.
# `> 0` flags the genres that are present; the unary `+` converts that logical
# matrix to 0/1 integers so the new columns are <int>, as in the printed
# output, rather than TRUE/FALSE.
m1 <- MovieRating_test[, +(mtabulate(strsplit(genres, ",")) > 0)]
# Add one column per genre to the data.table by reference.
MovieRating_test[, colnames(m1) := as.data.frame(m1)]

-output

> MovieRating_test
       tconst                   genres Animation Comedy Documentary Romance Short Sport
       <char>                   <char>     <int>  <int>       <int>   <int> <int> <int>
 1: tt0000001        Documentary,Short         0      0           1       0     1     0
 2: tt0000002          Animation,Short         1      0           0       0     1     0
 3: tt0000003 Animation,Comedy,Romance         1      1           0       1     0     0
 4: tt0000004          Animation,Short         1      0           0       0     1     0
 5: tt0000005             Comedy,Short         0      1           0       0     1     0
 6: tt0000006                    Short         0      0           0       0     1     0
 7: tt0000007              Short,Sport         0      0           0       0     1     1
 8: tt0000008        Documentary,Short         0      0           1       0     1     0
 9: tt0000009            Romance,Short         0      0           0       1     1     0
10: tt0000010        Documentary,Short         0      0           1       0     1     0
11: tt0000011        Documentary,Short         0      0           1       0     1     0
12: tt0000012        Documentary,Short         0      0           1       0     1     0
13: tt0000013        Documentary,Short         0      0           1       0     1     0
14: tt0000014             Comedy,Short         0      1           0       0     1     0
15: tt0000015          Animation,Short         1      0           0       0     1     0