I would like to obtain a generic formula to arrange dataframes with a varying number of columns.
For example, in this case the dataframe contains "categ_1, categ_2, points_1, points_2":
library(tidyverse)
set.seed(1)
nrows <- 20
df <- tibble(
other_text = sample(letters,
nrows, replace = TRUE),
categ_1 = sample(c("A", "B"), nrows, replace = TRUE),
categ_2 = sample(c("A", "B"), nrows, replace = TRUE),
points_1 = sample(20:25, nrows, replace = TRUE),
points_2 = sample(20:25, nrows, replace = TRUE),
) %>%
rowwise() %>%
mutate(total = sum(c_across(starts_with("points_")))) %>%
ungroup()
And the formula to arrange:
df %>%
arrange(
desc(total),
categ_1, categ_2,
desc(points_1), desc(points_2)
)
But df could have more columns: "categ_1, categ_2, categ_3, points_1, points_2, points_3". So, in that case, the formula should be:
df %>%
mutate(
categ_3 = sample(c("A", "B"), nrows, replace = TRUE),
points_3 = sample(20:25, nrows, replace = TRUE),
) %>%
rowwise() %>%
mutate(total = sum(c_across(starts_with("points_")))) %>%
ungroup() %>%
arrange(
desc(total),
categ_1, categ_2, categ_3,
desc(points_1), desc(points_2), desc(points_3)
)
I tried writing a generic formula (using across
):
library(daff)
daff::diff_data(
df %>%
arrange(
desc(total),
categ_1, categ_2,
desc(points_1), desc(points_2)
)
,
df %>%
arrange(
desc(total),
across(starts_with("categ_")),
across(starts_with("points_"), desc)
)
)
#> Daff Comparison: ‘df %>% arrange(desc(total), categ_1, categ_2, desc(points_1), ’ ‘ desc(points_2))’ vs. ‘df %>% arrange(desc(total), across(starts_with("categ_")), across(starts_with("points_"), ’ ‘ desc))’
#> A:A B:B ... E:E F:F
#> @@ other_text categ_1 ... points_2 total
#> ... ... ... ... ... ...
#> 10:9 z A ... 23 45
#> 9:10 : v A ... 22 45
#> 11:11 s B ... 23 45
#> ... ... ... ... ... ...
It seems like a bug in arrange
: arrange
only considers the parameters until the first across
.
I also tried writing the conditions inside a case_when
but couldn't find the correct syntax:
# not working
df %>%
arrange(
across(everything(), ~ case_when(
. == "total" ~ .,
str_detect(., "categ_") ~ .,
str_detect(., "points_") ~ desc(.),
TRUE ~ 1
)
)
)
#> Error in `arrange()`:
#> ! Problem with the implicit `transmute()` step.
What would be the generic way of writing that formula inside arrange
?
(Other alternatives are welcome but I would prefer a tidyverse solution.)
CodePudding user response:
An easy (sort of) solution would be to use cur_column()
to determine the column name which eventually determines the sorting order:
Data
library(dplyr)
library(daff)
set.seed(32211)
nrows <- 20
df <- tibble(
other_text = sample(letters,
nrows, replace = TRUE),
categ_1 = sample(c("A", "B"), nrows, replace = TRUE),
categ_2 = sample(c("A", "B"), nrows, replace = TRUE),
points_1 = sample(20:25, nrows, replace = TRUE),
points_2 = sample(20:25, nrows, replace = TRUE),
) %>%
rowwise() %>%
mutate(total = sum(c_across(starts_with("points_")))) %>%
ungroup()
Comparison
by_hand <- df %>%
arrange(
desc(total),
categ_1, categ_2,
desc(points_1), desc(points_2)
)
with_across <- df %>%
arrange(
across(
c(total, starts_with("categ_"), starts_with("points_")),
.fns = function(x) {
if (grepl("^total$|^points_.*$", cur_column()))
desc(x)
else
x
})
)
diff_data(by_hand, with_across)
# Daff Comparison: ‘by_hand’ vs. ‘with_across’
# other_text categ_1 categ_2 points_1 points_2 total
CodePudding user response:
Latest, super-simple fix
Install development version:
# remotes::install_github("tidyverse/dplyr")
library(tidyverse)
set.seed(144)
nrows <- 20
df <- tibble(
other_text = sample(letters,
nrows, replace = FALSE),
categ_1 = sample(c("A", "B"), nrows, replace = TRUE),
categ_2 = sample(c("A", "B"), nrows, replace = TRUE),
points_1 = sample(1:25, nrows, replace = FALSE),
points_2 = sample(100:125, nrows, replace = FALSE),
) %>%
rowwise() %>%
mutate(total = sum(c_across(starts_with("points_")))) %>%
ungroup()
out1 <- df %>%
arrange(
desc(total),
categ_1, categ_2,
desc(points_1), desc(points_2)
)
out2 <- df %>%
arrange(
desc(total),
across(starts_with("categ_")),
across(starts_with("points_"), desc)
)
daff::diff_data(out1, out2)
#> Daff Comparison: 'out1' vs. 'out2'
#> other_text categ_1 ...
CodePudding user response:
You could try just wrapping everything in the arrange()
in a data frame. Looks like arrange()
does some code manipulation to handle top-level desc()
calls in a special manner, which has a bad interaction with the data frames created by across()
. But that can be avoided using the data frame unpacking feature.
library(tidyverse)
set.seed(3)
nrows <- 20
df <- tibble(
other_text = sample(letters, nrows, replace = TRUE),
categ_1 = sample(c("A", "B"), nrows, replace = TRUE),
categ_2 = sample(c("A", "B"), nrows, replace = TRUE),
points_1 = sample(20:25, nrows, replace = TRUE),
points_2 = sample(20:25, nrows, replace = TRUE),
) %>%
rowwise() %>%
mutate(total = sum(c_across(starts_with("points_")))) %>%
ungroup()
identical(
df %>%
arrange(
desc(total),
categ_1, categ_2,
desc(points_1), desc(points_2)
),
df %>%
arrange(
tibble(
desc(total),
across(starts_with("categ_")),
across(starts_with("points_"), desc)
)
)
)
#> [1] TRUE