I noticed this behavior below when
I used dplyr::case_when
instead of if
in reference to this article.
If the output of the second branch is an explicit string, it works as intended, but if the x
itself is specified, the result changes.
Why does only case_when
give different results?
x <- character(0)
dplyr::case_when(rlang::is_empty(x) ~ "Empty", !rlang::is_empty(x) ~ "Not empty")
#> [1] "Empty"
dplyr::case_when(rlang::is_empty(x) ~ "Empty", !rlang::is_empty(x) ~ x)
#> character(0)
if (rlang::is_empty(x)) "Empty" else if (!rlang::is_empty(x)) "Not empty"
#> [1] "Empty"
if (rlang::is_empty(x)) "Empty" else if (!rlang::is_empty(x)) x
#> [1] "Empty"
ifelse(rlang::is_empty(x), "Empty", "Not empty")
#> [1] "Empty"
ifelse(rlang::is_empty(x), "Empty", x)
#> [1] "Empty"
Created on 2022-08-16 by the reprex package (v2.0.1)
CodePudding user response:
This is likely a bug in case_when
or its internal helper functions.
We can put a browser
in the source code of case_when
to see what happens in both cases. Some internal functions must be called via :::
.
f <- function (...) {
browser()
fs <- dplyr:::compact_null(rlang::list2(...))
n <- length(fs)
error_call <- rlang::current_env()
if (n == 0) {
abort("No cases provided.", call = error_call)
}
query <- vector("list", n)
value <- vector("list", n)
default_env <- rlang::caller_env()
quos_pairs <- purrr::map2(fs, seq_along(fs), dplyr:::validate_formula, default_env = default_env,
dots_env = rlang::current_env(), error_call = error_call)
for (i in seq_len(n)) {
pair <- quos_pairs[[i]]
query[[i]] <- rlang::eval_tidy(pair$lhs, env = default_env)
value[[i]] <- rlang::eval_tidy(pair$rhs, env = default_env)
if (!is.logical(query[[i]])) {
dplyr:::abort_case_when_logical(pair$lhs, i, query[[i]],
error_call = error_call)
}
}
m <- dplyr:::validate_case_when_length(query, value, fs, error_call = error_call)
out <- value[[1]][rep(NA_integer_, m)]
replaced <- rep(FALSE, m)
for (i in seq_len(n)) {
out <- dplyr:::replace_with(out, query[[i]] & !replaced, value[[i]],
NULL, error_call = error_call)
replaced <- replaced | (query[[i]] & !is.na(query[[i]]))
}
out
}
and the helper internal replace_with
in dplyr
,
replacer <- function (x, i, val, name, reason = NULL, error_call = rlang::caller_env()) {
if (is.null(val)) {
return(x)
}
dplyr:::check_length(val, x, name, reason, error_call = error_call)
dplyr:::check_type(val, x, name, error_call = error_call)
dplyr:::check_class(val, x, name, error_call = error_call)
i[is.na(i)] <- FALSE
if (length(val) == 1L) {
x[i] <- val
}
else {
x[i] <- val[i]
}
x
}
and then debug via
x <- character(0)
f(rlang::is_empty(x) ~ "Empty", !rlang::is_empty(x) ~ "x")
f(rlang::is_empty(x) ~ "Empty", !rlang::is_empty(x) ~ x)
the key is in value m
, that in the working case results in 1L
, in the faulty case it is 0L
. out
becomes character(0)
instead of initializing to NA
, length 1.
replaced
should be a logical vector indicating whether a value has been replaced. In the faulty case rep(FALSE, 0L)
is logical(0)
, which is queried later on via !replaced
. FALSE & logical(0)
gives logical(0)
.
When passed to replacer
this gives a peculiar subsetting action character(0)[logical(0)]
, that gives character(0)
.
CodePudding user response:
One big difference between case_when
and ifelse
is that case_when
coerces the right-hand side arguments into the same type. From ?case_when
:
The RHS does not need to be logical, but all RHSs must evaluate to the same type of vector.
It might also throw errors.
dplyr::case_when(rlang::is_empty(x) ~ 1, !rlang::is_empty(x) ~ "2")
# Error in names(message) <- `*vtmp*` :
# 'names' attribute [1] must be the same length as the vector [0]
dplyr::case_when(rlang::is_empty(x) ~ "1", !rlang::is_empty(x) ~ "2")
[1] "1"
These messages could for sure be more specific.
As mentioned, this does not explain what I think is a bug within the case_when
source code, specifically validate_case_when_length
:
m <- dplyr:::validate_case_when_length(query, value, fs, error_call = error_call)
which erroneously returns 0 in this case.
mycase_when <- function (...)
{
fs <- dplyr:::compact_null(rlang:::list2(...))
n <- length(fs)
error_call <- current_env()
if (n == 0) {
abort("No cases provided.", call = error_call)
}
query <- vector("list", n)
value <- vector("list", n)
default_env <- caller_env()
quos_pairs <- dplyr:::map2(fs, seq_along(fs), dplyr:::validate_formula, default_env = default_env,
dots_env = current_env(), error_call = error_call)
for (i in seq_len(n)) {
pair <- quos_pairs[[i]]
query[[i]] <- eval_tidy(pair$lhs, env = default_env)
value[[i]] <- eval_tidy(pair$rhs, env = default_env)
if (!is.logical(query[[i]])) {
abort_case_when_logical(pair$lhs, i, query[[i]],
error_call = error_call)
}
}
m <- dplyr:::validate_case_when_length(query, value, fs, error_call = error_call)
print(paste('m',m))
print(paste('value', value))
print(paste('value[[1]][1]', value[[1]][1]))
out <- value[[1]][rep(NA_integer_, m)]
replaced <- rep(FALSE, m)
print(paste('out 1',out))
for (i in seq_len(n)) {
out <- dplyr:::replace_with(out, query[[i]] & !replaced, value[[i]],
NULL, error_call = error_call)
replaced <- replaced | (query[[i]] & !is.na(query[[i]]))
}
# print(paste('out 2',out))
out
}
mycase_when(rlang::is_empty(x) ~ "Empty", !rlang::is_empty(x) ~ "Not empty")
mycase_when(rlang::is_empty(x) ~ "Empty", !rlang::is_empty(x) ~ x)
mycase_when(rlang::is_empty(x) ~ "Empty", !rlang::is_empty(x) ~ "Not empty")
[1] "m 1"
[1] "value Empty" "value Not empty"
[1] "value[[1]][1] Empty"
[1] "out 1 NA"
[1] "Empty"
mycase_when(rlang::is_empty(x) ~ "Empty", !rlang::is_empty(x) ~ x)
[1] "m 0"
[1] "value Empty" "value character(0)"
[1] "value[[1]][1] Empty"
[1] "out 1 "
character(0)
CodePudding user response:
This is because case_when
tries to recycle all formulas to a common length. The documentation says:
Both LHS and RHS may have the same length of either 1 or n. The value of n must be consistent across all cases. The case of n == 0 is treated as a variant of n != 1.
In your second example you've supplied a 0-length vector as the RHS of one of the formulas and this is the reason that a 0-length vector is returned; it's just a coincidence that the value returned is the same as the RHS of false case.
This behaviour might seem less surprising when considering these examples:
library(dplyr, warn.conflicts = FALSE)
case_when(TRUE ~ 1, FALSE ~ numeric(0))
#> numeric(0)
case_when(TRUE ~ 1, FALSE ~ numeric(1))
#> [1] 1
case_when(TRUE ~ 1, FALSE ~ numeric(2))
#> [1] 1 1
case_when(TRUE ~ 1, FALSE ~ numeric(3))
#> [1] 1 1 1