How to use purrr to convert factor variables to dichotomous variables-CodePudding

I would like to convert many factor variables to dichotomous variables using purrr. Here is an example of what I'm trying to accomplish, using a toy dataset with a function adapted from this answer:

library(dplyr)
library(forcats)
library(tidyr)
library(purrr)

df <- tibble(a = c(1,2,3), 
             b = c(1,1,2), 
             c = as_factor(c("Rose","Pink","Red")), 
             d = c(2,3,4),
             e = as_factor(c("Paris", "London", "Paris"))
)

fac_to_d <- function(.data, col) {
  .data %>%
    mutate(value = 1) %>% 
    pivot_wider(names_from  = {{col}},
                values_from = value,
                values_fill = 0)
}

The function works:

df %>% 
  fac_to_d("c") %>% 
  fac_to_d("e")
#> # A tibble: 3 × 8
#>       a     b     d  Rose  Pink   Red Paris London
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
#> 1     1     1     2     1     0     0     1      0
#> 2     2     1     3     0     1     0     0      1
#> 3     3     2     4     0     0     1     1      0

But I can't figure out how to make it work with purrr. For example:

cols <- c("c", "e")
df %>% map_dfr(.f = fac_to_d, col = cols)
#> Error in UseMethod("mutate"): no applicable method for 'mutate' applied to an object of class "c('double', 'numeric')"
df %>% map(.f = fac_to_d, col = cols)
#> Error in UseMethod("mutate"): no applicable method for 'mutate' applied to an object of class "c('double', 'numeric')"

How can I get this function to work with purrr? (If there is a better tidy way to convert many factor variables to dichotomous variables, I'd also be interested in learning about that!)

CodePudding user response：

I suggest using the tidymodels whenever we are dealing with features

library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(forcats)
library(tidyr)
library(purrr)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#>   method                   from   
#>   required_pkgs.model_spec parsnip


data_example <- tibble(a = c(1,2,3), 
             b = c(1,1,2), 
             c = as_factor(c("Rose","Pink","Red")), 
             d = c(2,3,4),
             e = as_factor(c("Paris", "London", "Paris"))
)

fac_to_d <- function(.data, col) {
  .data %>%
    mutate(value = 1) %>% 
    pivot_wider(names_from  = {{col}},
                values_from = value,
                values_fill = 0)
}


recipe_hot_encode <- recipe(x = data_example)



recipe_hot_encode |> 
  step_dummy(c(c,e),one_hot = TRUE) |> 
  prep() |> 
  juice()
#> # A tibble: 3 x 8
#>       a     b     d c_Rose c_Pink c_Red e_Paris e_London
#>   <dbl> <dbl> <dbl>  <dbl>  <dbl> <dbl>   <dbl>    <dbl>
#> 1     1     1     2      1      0     0       1        0
#> 2     2     1     3      0      1     0       0        1
#> 3     3     2     4      0      0     1       1        0



# if you want the old names


dummy_names2 <- function (var, lvl, ordinal = FALSE, sep = "_") 
{
  args <- vctrs::vec_recycle_common(var, lvl)
  var <- args[[1]]
  lvl <- args[[2]]
  if (!ordinal) 
    nms <- paste(make.names(lvl), sep = sep)
  else nms <- paste0(names0(length(lvl), sep))
  nms
}


recipe_hot_encode |> 
  step_dummy(c(c,e),one_hot = TRUE,naming = dummy_names2) |> 
  prep() |> 
  juice()
#> # A tibble: 3 x 8
#>       a     b     d  Rose  Pink   Red Paris London
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
#> 1     1     1     2     1     0     0     1      0
#> 2     2     1     3     0     1     0     0      1
#> 3     3     2     4     0     0     1     1      0

^{Created on 2021-10-21 by the reprex package (v2.0.1)}

CodePudding user response：

Try this:

library(purrr)
library(dplyr)
library(tidyr)

df %>% 
  mutate(c_one_hot = map(c, ~ set_names(levels(c) == .x, levels(c)))) %>% 
  unnest_wider(c_one_hot) %>% 
  mutate(e_one_hot = map(e, ~ set_names(levels(e) == .x, levels(e)))) %>% 
  unnest_wider(e_one_hot) %>% 
  mutate(across(everything(), ~.*1)) %>% 
  select(-c, -e)

output:

     a     b     d  Rose  Pink   Red Paris London
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
1     1     1     2     1     0     0     1      0
2     2     1     3     0     1     0     0      1
3     3     2     4     0     0     1     1      0

CodePudding user response：

Here's a possible approach, but it involves looping through each categorical variable and making the dummy variables separately and then binding them back to the numeric variables at the end.

I use model.matrix() to make the dummy variables with reformulate() to construct the formula. Note reformulate() has the helpful intercept argument to suppress the intercept to avoid treatment contrasts.

Here's a function for work, taking a dataset and column name as strings. I make it into a data.frame so it will work with map_df() functions:

to_dummy = function(data, col) {
    col %>%
        reformulate(intercept = FALSE) %>%
        model.matrix(data = data) %>%
        as.data.frame()
}
to_dummy(data = df, col = "e")
#>   eParis eLondon
#> 1      1       0
#> 2      0       1
#> 3      1       0

Then loop through your categorical columns from your string vector with map_dfc() (for column binding). The final step is binding the numeric columns back to the rest of the dataset, which I somewhat awkwardly do with a nested select().

cols %>%
    map_dfc(.f = to_dummy, data = df) %>%
    cbind(select(df, where(is.numeric)), .)
#>   a b d cRose cPink cRed eParis eLondon
#> 1 1 1 2     1     0    0      1       0
#> 2 2 1 3     0     1    0      0       1
#> 3 3 2 4     0     0    1      1       0

^{Created on 2021-10-21 by the reprex package (v2.0.0)}

A downside is that the column names are based on the original variable plus the factor levels, not just the factor levels.