Consider this sample data
# A tibble: 10 x 3
x y z
<int> <dbl> <dbl>
1 1 1 5
2 2 3 6
3 3 4 7
4 4 7 8
5 5 NA 9
6 6 12 10
7 7 NA NA
8 8 2 20
9 9 5.5 25
10 10 NA 8
I would like to mutate a new column value
that rowSums if there is no NAs
present in any of the columns.
If there are, take the highest value in the row times
1.2
.BUT, if there are only one column with a value, take that value
Finally, another column
NA_column
with the names of columns containingNA
in that row!
What I have in mind, but could not figure out the rest.
df %>%
mutate(
value = case_when(any_vars(is.na()) ~ BIGGEST VALUE * 1.2,
TRUE ~ rowsum()),
NA_column = columns that has NA in that row
)
DATA
df <- tibble(
x = 1:10,
y = c(1, 3, 4, 7, NA, 12, NA, 2, 5.5, NA),
z = c(5:10, NA, 20, 25, 8)
)
CodePudding user response:
library(dplyr); library(magrittr)
# Take the max of the columns
df_out<- df %>%
rowwise() %>%
mutate(
value = ifelse(anyNA(c_across(everything())), max(c_across(everything()), na.rm = T) * 1.2, x) # or 'y' or 'z' instead of x
) %>%
ungroup()
# Add NA_column
df_out$NA_column <- sapply(1:nrow(df), \(i) {
df[i, ] %$%
colnames(.)[is.na(.)] %>%
paste(collapse = " | ")
})
df_out
# A tibble: 10 × 5
x y z value NA_column
<int> <dbl> <dbl> <dbl> <chr>
1 1 1 5 1 ""
2 2 3 6 2 ""
3 3 4 7 3 ""
4 4 7 8 4 ""
5 5 NA 9 10.8 "y"
6 6 12 10 6 ""
7 7 NA NA 8.4 "y | z"
8 8 2 20 8 ""
9 9 5.5 25 9 ""
10 10 NA 8 12 "y"
There is probably a more direct solution for the second part
CodePudding user response:
Often when solving a problem such as this I find it best to lay out the solution in discrete steps. In this case, using tidyverse syntax, it is possible to create temporary columns containing the bits of data needed to ultimately compute the desired value.
I couldn't immediately improve upon the solution provided for the second part (NA_column) by @Julien above, so I've added that code chunk below.
df <- tibble(
x = 1:10,
y = c(1, 3, 4, 7, NA, 12, NA, 2, 5.5, NA),
z = c(5:10, NA, 20, 25, 8)
)
out <-
df %>%
mutate(
# get number of columns of data
num_cols = ncol(.),
# get number of values in row that are not NA
num_not_na = rowSums(!is.na(.))
) %>%
rowwise() %>%
mutate(
# get maximum value of data in row, note we need to be explicit about which columns are data, e.g., (1:3)
max_value = max(across(1:3), na.rm = TRUE)
) %>%
ungroup() %>%
mutate(
# get the desired value for the row
# if there are no NA values or only one non-NA value then we can just get the row sum,
# again we need to be explicit about the columns, e.g. [, 1:3]
# otherwise take the maximum value multiplied by 1.2
value = ifelse(num_cols == num_not_na | num_not_na == 1, rowSums(.[, 1:3], na.rm = TRUE), max_value * 1.2)
)
# with credit to @Julien for the following code to get the names of the NA columns
out$NA_column <- sapply(1:nrow(out), function(i) {
out[i, ] %$%
colnames(.)[is.na(.)] %>%
paste(collapse = " | ")
})
# can optionally remove the temporary columns
out <-
out %>%
dplyr::select(
-c(num_cols, num_not_na, max_value)
)
# > out
# # A tibble: 10 x 5
# x y z value NA_column
# <int> <dbl> <dbl> <dbl> <chr>
# 1 1 1 5 7 ""
# 2 2 3 6 11 ""
# 3 3 4 7 14 ""
# 4 4 7 8 19 ""
# 5 5 NA 9 10.8 "y"
# 6 6 12 10 28 ""
# 7 7 NA NA 7 "y | z"
# 8 8 2 20 30 ""
# 9 9 5.5 25 39.5 ""
#10 10 NA 8 12 "y"