I'm trying to create a binary T/F column, which is T if a 1 is present in that particular row of the dataframe.
df <- tibble(
d1 = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
d2 = c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0),
d3 = c(0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
d4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0),
d5 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
d6 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
d7 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0),
d8 = c(0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0)
)
I can use the following approach to achieve what I want:
df$d9 <- NA
df$d9[df$d1 == 1] <- TRUE
df$d9[df$d2 == 1] <- TRUE
df$d9[df$d3 == 1] <- TRUE
df$d9[df$d4 == 1] <- TRUE
df$d9[df$d5 == 1] <- TRUE
df$d9[df$d6 == 1] <- TRUE
df$d9[df$d7 == 1] <- TRUE
df$d9[df$d8 == 1] <- TRUE
Which results in:
d1 d2 d3 d4 d5 d6 d7 d8 d9
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
1 1 0 0 0 0 0 0 0 TRUE
2 0 0 0 0 0 0 0 0 NA
3 0 0 1 0 0 0 0 0 TRUE
4 0 0 0 0 0 0 0 0 NA
5 0 0 0 0 0 0 0 0 NA
6 0 0 0 0 0 0 0 0 NA
7 0 0 0 0 0 0 0 1 TRUE
8 0 1 0 0 0 0 0 0 TRUE
9 0 0 0 0 0 0 0 0 NA
10 0 0 0 0 0 0 0 0 NA
11 0 0 0 1 0 0 1 0 TRUE
12 0 0 0 0 0 0 0 0 NA
13 0 0 0 0 0 0 0 0 NA
14 0 0 0 0 0 0 0 0 NA
15 0 0 0 0 0 0 0 0 NA
But I am sure there must be a more elegant solution to this problem.
First, I would like to be able to create a variable containing the column names that need to be checked i.e. dfCols <- c("d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8")
and call this when checking, rather than explicitly calling each of 'd1', 'd2', etc.
I also feel there must be a tidyverse/dplyr solution out there, but have struggled to find one.
Thanks!
CodePudding user response:
base R
df$d9 <- rowSums(df == 1L) > 0
df
# # A tibble: 15 x 9
# d1 d2 d3 d4 d5 d6 d7 d8 d9
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
# 1 1 0 0 0 0 0 0 0 TRUE
# 2 0 0 0 0 0 0 0 0 FALSE
# 3 0 0 1 0 0 0 0 0 TRUE
# 4 0 0 0 0 0 0 0 0 FALSE
# 5 0 0 0 0 0 0 0 0 FALSE
# 6 0 0 0 0 0 0 0 0 FALSE
# 7 0 0 0 0 0 0 0 1 TRUE
# 8 0 1 0 0 0 0 0 0 TRUE
# 9 0 0 0 0 0 0 0 0 FALSE
# 10 0 0 0 0 0 0 0 0 FALSE
# 11 0 0 0 1 0 0 1 0 TRUE
# 12 0 0 0 0 0 0 0 0 FALSE
# 13 0 0 0 0 0 0 0 0 FALSE
# 14 0 0 0 0 0 0 0 0 FALSE
# 15 0 0 0 0 0 0 0 0 FALSE
(The comparison can be df == 1L
or df != 0
.)
dplyr
Really, rowSums
is the best, we can put it in a mutate
:
df %>%
mutate(d9 = rowSums(cur_data() == 1L) > 0)
CodePudding user response:
creditModel
library function rowAny
:
Try rowAny
function from creditModel
library:
library(creditmodel)
df$d9 <- rowAny(df == 1)
df
Output:
d1 d2 d3 d4 d5 d6 d7 d8 d9
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
1 1 0 0 0 0 0 0 0 TRUE
2 0 0 0 0 0 0 0 0 FALSE
3 0 0 1 0 0 0 0 0 TRUE
4 0 0 0 0 0 0 0 0 FALSE
5 0 0 0 0 0 0 0 0 FALSE
6 0 0 0 0 0 0 0 0 FALSE
7 0 0 0 0 0 0 0 1 TRUE
8 0 1 0 0 0 0 0 0 TRUE
9 0 0 0 0 0 0 0 0 FALSE
10 0 0 0 0 0 0 0 0 FALSE
11 0 0 0 1 0 0 1 0 TRUE
12 0 0 0 0 0 0 0 0 FALSE
13 0 0 0 0 0 0 0 0 FALSE
14 0 0 0 0 0 0 0 0 FALSE
15 0 0 0 0 0 0 0 0 FALSE
If you want NA
for FALSE
values, try replace
:
library(creditmodel)
x <- rowAny(df == 1)
df$d9 <- replace(x, x == FALSE, NA)
df
Output:
d1 d2 d3 d4 d5 d6 d7 d8 d9
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
1 1 0 0 0 0 0 0 0 TRUE
2 0 0 0 0 0 0 0 0 NA
3 0 0 1 0 0 0 0 0 TRUE
4 0 0 0 0 0 0 0 0 NA
5 0 0 0 0 0 0 0 0 NA
6 0 0 0 0 0 0 0 0 NA
7 0 0 0 0 0 0 0 1 TRUE
8 0 1 0 0 0 0 0 0 TRUE
9 0 0 0 0 0 0 0 0 NA
10 0 0 0 0 0 0 0 0 NA
11 0 0 0 1 0 0 1 0 TRUE
12 0 0 0 0 0 0 0 0 NA
13 0 0 0 0 0 0 0 0 NA
14 0 0 0 0 0 0 0 0 NA
15 0 0 0 0 0 0 0 0 NA
CodePudding user response:
Here a dplyr
way:
library(dplyr)
df %>%
rowwise() %>%
mutate(d9 = paste0(names(.)[c_across(everything()) == 1], collapse = ' '),
d9 = na_if(d9, ""),
d9 = ifelse(is.na(d9), FALSE, TRUE))
output:
d1 d2 d3 d4 d5 d6 d7 d8 d9
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
1 1 0 0 0 0 0 0 0 TRUE
2 0 0 0 0 0 0 0 0 FALSE
3 0 0 1 0 0 0 0 0 TRUE
4 0 0 0 0 0 0 0 0 FALSE
5 0 0 0 0 0 0 0 0 FALSE
6 0 0 0 0 0 0 0 0 FALSE
7 0 0 0 0 0 0 0 1 TRUE
8 0 1 0 0 0 0 0 0 TRUE
9 0 0 0 0 0 0 0 0 FALSE
10 0 0 0 0 0 0 0 0 FALSE
11 0 0 0 1 0 0 1 0 TRUE
12 0 0 0 0 0 0 0 0 FALSE
13 0 0 0 0 0 0 0 0 FALSE
14 0 0 0 0 0 0 0 0 FALSE
15 0 0 0 0 0 0 0 0 FALSE
CodePudding user response:
I don't knovv a dplyr solution but I think that the follovving is better:
df$d9 <- rowSums(df) > 0
EDIT
A dplyr way:
df %>% rowwise() %>% mutate(d9 = any(cur_data() == 1))
Another one:
df %>% mutate(d9 = if_any(fns = ~any(.x == 1)))
Benchmarks
library(dplyr)
library(microbenchmark)
nrows <- 20
ncols <- 20
set.seed(666)
M <- matrix(
sample.int(2, size = nrows*ncols, replace = TRUE, prob = c(0.95, 0.05)) - 1L,
nrow = nrows, ncol = ncols
)
rowSums(M)
df <- as_tibble(M)
microbenchmark(
rowSums = {
df$V21 <- rowSums(df) > 0L
},
dplyr_rowSums = {
df %>% mutate(V21 = rowSums(cur_data() == 1L) > 0L)
},
dplyr_rowwise_any = {
df %>% rowwise() %>% mutate(V21 = any(cur_data() == 1L))
},
dplyr_ifany = {
df %>% mutate(V21 = if_any(fns = ~any(.x == 1L)))
},
times = 20
)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# rowSums 333.701 399.651 501.8161 503.6505 585.8005 743.601 20 a
# dplyr_rowSums 3387.401 3813.851 4353.3061 4163.6010 4879.6020 5600.700 20 b
# dplyr_rowwise_any 15224.700 17611.251 18966.8210 18589.3515 19717.5515 24586.001 20 d
# dplyr_ifany 4679.800 4945.251 6108.6458 5734.1510 6517.7505 11656.401 20 c
CodePudding user response:
Using if_any
in dplyr
-
library(dplyr)
df %>% mutate(d9 = if_any(.fns = ~.x == 1))
# d1 d2 d3 d4 d5 d6 d7 d8 d9
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
# 1 1 0 0 0 0 0 0 0 TRUE
# 2 0 0 0 0 0 0 0 0 FALSE
# 3 0 0 1 0 0 0 0 0 TRUE
# 4 0 0 0 0 0 0 0 0 FALSE
# 5 0 0 0 0 0 0 0 0 FALSE
# 6 0 0 0 0 0 0 0 0 FALSE
# 7 0 0 0 0 0 0 0 1 TRUE
# 8 0 1 0 0 0 0 0 0 TRUE
# 9 0 0 0 0 0 0 0 0 FALSE
#10 0 0 0 0 0 0 0 0 FALSE
#11 0 0 0 1 0 0 1 0 TRUE
#12 0 0 0 0 0 0 0 0 FALSE
#13 0 0 0 0 0 0 0 0 FALSE
#14 0 0 0 0 0 0 0 0 FALSE
#15 0 0 0 0 0 0 0 0 FALSE