How to convert multiple binary columns into a single character column?-CodePudding

I would like to convert data frame df1 into data frame df2.

# Example input: one row per id, with one 0/1 indicator column per outcome.
id <- c(1, 2, 3)
outcome_1 <- c(1, 0, 1)
outcome_2 <- c(1, 1, 0)
df1 <- data.frame(id = id, outcome_1 = outcome_1, outcome_2 = outcome_2)

# Desired result: the numbers of the positive outcomes joined into a single
# character column.
id <- c(1, 2, 3)
outcome <- c("1,2", "2", "1")
df2 <- data.frame(id = id, outcome = outcome)

The answers to the following question almost do what I want, but in my case a row can have more than one positive outcome (e.g. first row needs to be "1,2"). Also, I would like the resulting column to be a character column.

R: Converting multiple binary columns into one factor variable whose factors are binary column names

Please kindly help. Thank you.

CodePudding user response：

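Take the numeric suffix of each outcome column name (the substring starting at character 9, i.e. after "outcome_") and, for each row, keep only the suffixes whose binary value is 1, using the values coerced with as.logical as the index.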

# For every row, keep the outcome numbers (column-name text from character 9
# onward) whose flag is non-zero and join them with ", ".
apply(
  df1[-1],
  1,
  function(flags) {
    toString(substring(colnames(df1)[-1], 9)[as.logical(flags)])
  }
)
# [1] "1, 2" "2"    "1"

# Same selection as above, but joined with a bare "," (no space).
apply(
  df1[-1],
  1,
  function(flags) {
    paste0(substring(colnames(df1)[-1], 9)[as.logical(flags)], collapse = ",")
  }
)
# [1] "1,2" "2"   "1"

Using the first method:

# Put the ", "-separated outcome labels next to the id column.
data.frame(
  df1[1],
  outcome = apply(
    df1[-1],
    1,
    function(flags) {
      toString(substring(colnames(df1)[-1], 9)[as.logical(flags)])
    }
  ),
  check.names = FALSE
)
#   id outcome
# 1  1    1, 2
# 2  2       2
# 3  3       1

If you want a list column instead (each row holding a numeric vector of its positive outcomes), you may use list2DF.

# Build a data frame whose `outcome` column is a list column holding, for each
# row, the numeric ids of its positive outcomes.

# Outcome ids parsed once from the column names (text from character 9 onward).
outcome_ids <- as.numeric(substring(names(df1)[-1], 9))

# `simplify = FALSE` makes apply() return a list for every input. By default it
# collapses the result to a vector or matrix whenever all rows happen to have
# the same number of positive outcomes, which would silently break the list
# column.
l <- list2DF(c(
  df1[1],
  outcome = list(
    apply(df1[-1], 1, \(x) outcome_ids[as.logical(x)], simplify = FALSE)
  )
))
l
#   id outcome
# 1  1    1, 2
# 2  2       2
# 3  3       1

where

# Inspect the structure of `l`; the printed output is reproduced below.
str(l)
# 'data.frame': 3 obs. of  2 variables:
#  $ id     : num  1 2 3
#  $ outcome:List of 3
#   ..$ : num  1 2
#   ..$ : num 2
#   ..$ : num 1

Data:

# Reproducible copy of the question's example data.
df1 <- data.frame(
  id = c(1, 2, 3),
  outcome_1 = c(1, 0, 1),
  outcome_2 = c(1, 1, 0)
)

CodePudding user response：

Here is one more tidyverse approach:

library(dplyr)
library(tidyr)

# Replace each positive flag with its column name (NA otherwise) in helper
# columns prefixed "new_", glue the non-missing names together, then strip the
# "outcome_" prefix so only the outcome numbers remain.
df1 %>%
  mutate(
    across(
      -id,
      ~ if_else(.x == 1, cur_column(), NA_character_),
      .names = "new_{col}"
    ),
    .keep = "unused"
  ) %>%
  unite(outcome, starts_with("new"), sep = ", ", na.rm = TRUE) %>%
  mutate(outcome = gsub("outcome_", "", outcome))

  id outcome
1  1    1, 2
2  2       2
3  3       1

CodePudding user response：

Another possible solution, based on dplyr and purrr::pmap:

library(tidyverse)

# For each row, scale every outcome flag by its outcome number, drop the zeros
# and join what remains with ", ".
# `pmap_chr()` is used rather than `pmap()` so that `outcome` is a plain
# character column instead of a list column, and `paste()` yields "" for a row
# with no positive outcome so every row contributes exactly one string.
df1 %>%
  transmute(
    id,
    outcome = pmap_chr(., ~ {
      hits <- c(1 * ..2, 2 * ..3)
      paste(hits[hits != 0], collapse = ", ")
    })
  )

#>   id outcome
#> 1  1    1, 2
#> 2  2       2
#> 3  3       1

Or simply:

library(tidyverse)

# Build one single-row data frame per input row and row-bind them: the flags
# are scaled by their outcome number, zeros are dropped, and the rest is joined
# with ", ".
pmap_dfr(df1, ~ {
  hits <- c(1 * ..2, 2 * ..3)
  data.frame(id = ..1, outcome = str_c(hits[hits != 0], collapse = ", "))
})

#>   id outcome
#> 1  1    1, 2
#> 2  2       2
#> 3  3       1

CodePudding user response：

How many outcome_ columns are there? If just 2, this will work fine.

library(dplyr) 

# One output row per input row: list the positions (1 or 2) of the outcome
# columns that equal 1, joined with commas.
df1 %>%
  rowwise() %>%
  summarise(
    id = id,
    outcome = paste0(which(c(outcome_1, outcome_2) == 1), collapse = ",")
  )

# A tibble: 3 x 2
     id outcome
  <dbl> <chr>  
1     1 1,2    
2     2 2      
3     3 1

If there are more than 2, try this:

# Same idea for any number of outcome columns: c_across(-id) gathers every
# column except `id` for the current row, and which() reports the positions
# equal to 1.
df1 %>%
  rowwise() %>%
  summarise(
    id = id,
    outcome = paste0(which(c_across(-id) == 1), collapse = ",")
  )

CodePudding user response：

# Collapse the binary outcome columns into one character column listing the
# numbers of the positive outcomes for each row.

# Logical mask marking the outcome columns; it was used but never defined.
outcome_col_idx <- startsWith(names(df1), "outcome_")

# Outcome labels with the "outcome_" prefix removed, computed once up front.
outcome_labels <- sub("^outcome_", "", names(df1)[outcome_col_idx])

cbind(
  # drop = FALSE keeps a data frame even when only one column remains.
  df1[, !outcome_col_idx, drop = FALSE],
  outcome = apply(
    df1[, outcome_col_idx, drop = FALSE],
    1,
    function(x) {
      # A value counts as positive when it is neither missing nor zero.
      # Returning a plain string (no as.factor) keeps the column character.
      toString(outcome_labels[!is.na(x) & x != 0])
    }
  )
)