Home > front end >  dplyr solution for creating T/F column if value present in other columns in R
dplyr solution for creating T/F column if value present in other columns in R

Time:09-27

I'm trying to create a binary T/F column, which is T if a 1 is present in that particular row of the dataframe.

df <- tibble(
  d1 = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
  d2 = c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0),
  d3 = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
  d4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0),
  d5 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
  d6 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
  d7 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0),
  d8 = c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0)
)

I can use the following approach to achieve what I want:

df$d9 <- NA
df$d9[df$d1 == 1] <- TRUE
df$d9[df$d2 == 1] <- TRUE
df$d9[df$d3 == 1] <- TRUE
df$d9[df$d4 == 1] <- TRUE
df$d9[df$d5 == 1] <- TRUE
df$d9[df$d6 == 1] <- TRUE
df$d9[df$d7 == 1] <- TRUE
df$d9[df$d8 == 1] <- TRUE

Which results in:

     d1    d2    d3    d4    d5    d6    d7    d8 d9   
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
 1     1     0     0     0     0     0     0     0 TRUE 
 2     0     0     0     0     0     0     0     0 NA   
 3     0     0     1     0     0     0     0     0 TRUE 
 4     0     0     0     0     0     0     0     0 NA   
 5     0     0     0     0     0     0     0     0 NA   
 6     0     0     0     0     0     0     0     0 NA   
 7     0     0     0     0     0     0     0     1 TRUE 
 8     0     1     0     0     0     0     0     0 TRUE 
 9     0     0     0     0     0     0     0     0 NA   
10     0     0     0     0     0     0     0     0 NA   
11     0     0     0     1     0     0     1     0 TRUE 
12     0     0     0     0     0     0     0     0 NA   
13     0     0     0     0     0     0     0     0 NA   
14     0     0     0     0     0     0     0     0 NA   
15     0     0     0     0     0     0     0     0 NA   

But I am sure there must be a more elegant solution to this problem.

First, I would like to be able to create a variable containing the column names that need to be checked i.e. dfCols <- c("d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8") and call this when checking, rather than explicitly calling each of 'd1', 'd2', etc.

I also feel there must be a tidyverse/dplyr solution out there, but have struggled to find one.

Thanks!

CodePudding user response:

base R

df$d9 <- rowSums(df == 1L) > 0
df
# # A tibble: 15 x 9
#       d1    d2    d3    d4    d5    d6    d7    d8 d9   
#    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
#  1     1     0     0     0     0     0     0     0 TRUE 
#  2     0     0     0     0     0     0     0     0 FALSE
#  3     0     0     1     0     0     0     0     0 TRUE 
#  4     0     0     0     0     0     0     0     0 FALSE
#  5     0     0     0     0     0     0     0     0 FALSE
#  6     0     0     0     0     0     0     0     0 FALSE
#  7     0     0     0     0     0     0     0     1 TRUE 
#  8     0     1     0     0     0     0     0     0 TRUE 
#  9     0     0     0     0     0     0     0     0 FALSE
# 10     0     0     0     0     0     0     0     0 FALSE
# 11     0     0     0     1     0     0     1     0 TRUE 
# 12     0     0     0     0     0     0     0     0 FALSE
# 13     0     0     0     0     0     0     0     0 FALSE
# 14     0     0     0     0     0     0     0     0 FALSE
# 15     0     0     0     0     0     0     0     0 FALSE

(The comparison can be df == 1L or df != 0.)

dplyr

Really, rowSums is the best, we can put it in a mutate:

df %>%
  mutate(d9 = rowSums(cur_data() == 1L) > 0)

CodePudding user response:

creditModel library function rowAny:

Try rowAny function from creditModel library:

library(creditmodel)
df$d9 <- rowAny(df == 1)
df

Output:

      d1    d2    d3    d4    d5    d6    d7    d8 d9   
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
 1     1     0     0     0     0     0     0     0 TRUE 
 2     0     0     0     0     0     0     0     0 FALSE
 3     0     0     1     0     0     0     0     0 TRUE 
 4     0     0     0     0     0     0     0     0 FALSE
 5     0     0     0     0     0     0     0     0 FALSE
 6     0     0     0     0     0     0     0     0 FALSE
 7     0     0     0     0     0     0     0     1 TRUE 
 8     0     1     0     0     0     0     0     0 TRUE 
 9     0     0     0     0     0     0     0     0 FALSE
10     0     0     0     0     0     0     0     0 FALSE
11     0     0     0     1     0     0     1     0 TRUE 
12     0     0     0     0     0     0     0     0 FALSE
13     0     0     0     0     0     0     0     0 FALSE
14     0     0     0     0     0     0     0     0 FALSE
15     0     0     0     0     0     0     0     0 FALSE

If you want NA for FALSE values, try replace:

library(creditmodel)
x <- rowAny(df == 1)
df$d9 <- replace(x, x == FALSE, NA)
df

Output:

      d1    d2    d3    d4    d5    d6    d7    d8 d9   
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
 1     1     0     0     0     0     0     0     0 TRUE 
 2     0     0     0     0     0     0     0     0 NA   
 3     0     0     1     0     0     0     0     0 TRUE 
 4     0     0     0     0     0     0     0     0 NA   
 5     0     0     0     0     0     0     0     0 NA   
 6     0     0     0     0     0     0     0     0 NA   
 7     0     0     0     0     0     0     0     1 TRUE 
 8     0     1     0     0     0     0     0     0 TRUE 
 9     0     0     0     0     0     0     0     0 NA   
10     0     0     0     0     0     0     0     0 NA   
11     0     0     0     1     0     0     1     0 TRUE 
12     0     0     0     0     0     0     0     0 NA   
13     0     0     0     0     0     0     0     0 NA   
14     0     0     0     0     0     0     0     0 NA   
15     0     0     0     0     0     0     0     0 NA   

CodePudding user response:

Here a dplyr way:

library(dplyr)
df %>% 
  rowwise() %>% 
  mutate(d9 = paste0(names(.)[c_across(everything()) == 1], collapse = ' '),
         d9 = na_if(d9, ""),
         d9 = ifelse(is.na(d9), FALSE, TRUE))

output:

      d1    d2    d3    d4    d5    d6    d7    d8 d9   
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
 1     1     0     0     0     0     0     0     0 TRUE 
 2     0     0     0     0     0     0     0     0 FALSE
 3     0     0     1     0     0     0     0     0 TRUE 
 4     0     0     0     0     0     0     0     0 FALSE
 5     0     0     0     0     0     0     0     0 FALSE
 6     0     0     0     0     0     0     0     0 FALSE
 7     0     0     0     0     0     0     0     1 TRUE 
 8     0     1     0     0     0     0     0     0 TRUE 
 9     0     0     0     0     0     0     0     0 FALSE
10     0     0     0     0     0     0     0     0 FALSE
11     0     0     0     1     0     0     1     0 TRUE 
12     0     0     0     0     0     0     0     0 FALSE
13     0     0     0     0     0     0     0     0 FALSE
14     0     0     0     0     0     0     0     0 FALSE
15     0     0     0     0     0     0     0     0 FALSE

CodePudding user response:

I don't knovv a dplyr solution but I think that the follovving is better:

df$d9 <- rowSums(df) > 0

EDIT

A dplyr way:

df %>% rowwise() %>% mutate(d9 = any(cur_data() == 1))

Another one:

df %>% mutate(d9 = if_any(fns = ~any(.x == 1)))

Benchmarks

library(dplyr)
library(microbenchmark)

nrows <- 20
ncols <- 20
set.seed(666)
M <- matrix(
  sample.int(2, size = nrows*ncols, replace = TRUE, prob = c(0.95, 0.05)) - 1L, 
  nrow = nrows, ncol = ncols
)
rowSums(M)

df <- as_tibble(M)

microbenchmark(
  rowSums = {
    df$V21 <- rowSums(df) > 0L
  },
  dplyr_rowSums = {
    df %>% mutate(V21 = rowSums(cur_data() == 1L) > 0L)
  },
  dplyr_rowwise_any = {
    df %>% rowwise() %>% mutate(V21 = any(cur_data() == 1L))
  },
  dplyr_ifany = {
    df %>% mutate(V21 = if_any(fns = ~any(.x == 1L)))
  },
  times = 20
)

# Unit: microseconds
#               expr       min        lq       mean     median         uq       max neval  cld
#            rowSums   333.701   399.651   501.8161   503.6505   585.8005   743.601    20 a   
#      dplyr_rowSums  3387.401  3813.851  4353.3061  4163.6010  4879.6020  5600.700    20  b  
#  dplyr_rowwise_any 15224.700 17611.251 18966.8210 18589.3515 19717.5515 24586.001    20    d
#        dplyr_ifany  4679.800  4945.251  6108.6458  5734.1510  6517.7505 11656.401    20   c 

CodePudding user response:

Using if_any in dplyr -

library(dplyr)
df %>% mutate(d9 = if_any(.fns = ~.x == 1))

#      d1    d2    d3    d4    d5    d6    d7    d8 d9   
#   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
# 1     1     0     0     0     0     0     0     0 TRUE 
# 2     0     0     0     0     0     0     0     0 FALSE
# 3     0     0     1     0     0     0     0     0 TRUE 
# 4     0     0     0     0     0     0     0     0 FALSE
# 5     0     0     0     0     0     0     0     0 FALSE
# 6     0     0     0     0     0     0     0     0 FALSE
# 7     0     0     0     0     0     0     0     1 TRUE 
# 8     0     1     0     0     0     0     0     0 TRUE 
# 9     0     0     0     0     0     0     0     0 FALSE
#10     0     0     0     0     0     0     0     0 FALSE
#11     0     0     0     1     0     0     1     0 TRUE 
#12     0     0     0     0     0     0     0     0 FALSE
#13     0     0     0     0     0     0     0     0 FALSE
#14     0     0     0     0     0     0     0     0 FALSE
#15     0     0     0     0     0     0     0     0 FALSE
  • Related