There are many items (25 in the SDQ: strengths and difficulties scale) for which I would like to coalesce the 2to4yrs and 5to17yrs versions into the 2to4yrs columns.
library(tidyverse)
df <- data.frame(
sdq2to4yrs_item1 = c(1, NA, NA),
sdq5to17yrs_item1 = c(NA, NA, 2),
sdq2to4yrs_item2 = c(2, 2, NA),
sdq5to17yrs_item2 = c(1, 2, 3)
)
df
#> sdq2to4yrs_item1 sdq5to17yrs_item1 sdq2to4yrs_item2 sdq5to17yrs_item2
#> 1 1 NA 2 1
#> 2 NA NA 2 2
#> 3 NA 2 NA 3
## What I'm after
data.frame(
sdq2to4yrs_item1 = c(1, NA, 2),
sdq2to4yrs_item2 = c(2, 2, 3)
)
#> sdq2to4yrs_item1 sdq2to4yrs_item2
#> 1 1 2
#> 2 NA 2
#> 3 2 3
## The code I'd like to work
df %>%
mutate(
across(
matches("2to4yrs"),
~ coalesce(
!!!select(., matches(
cur_column() %>% str_remove(".*yrs_")
)
)
)
)
)
#> Error in local_error_context(dots = dots, .index = i, mask = mask): promise already under evaluation: recursive default argument reference or earlier problems?
Created on 2022-08-24 by the reprex package (v2.0.1)
CodePudding user response:
Here is an option that pivots longer, chooses the value per row/item, and then pivots back to wide
df %>%
mutate(i=row_number()) %>%
pivot_longer(-i, names_to=c("a", "item"), names_sep = "_" ,values_to = "v") %>%
group_by(i,item) %>%
summarize(k=if_else(is.na(v[1]),v[2],v[1]), .groups = "drop") %>%
pivot_wider(names_from = item, names_prefix = "sdq2to4years_", values_from=k)
Output:
id sdq2to4years_item1 sdq2to4years_item2
<int> <dbl> <dbl>
1 1 1 2
2 2 NA 2
3 3 2 3
or a couple of base R approaches:
setNames(
data.frame(sapply(seq(1, ncol(df),2),\(x) if_else(is.na(df[,x]),df[,x 1],df[,x]))),
paste0("sdq2to4yrs_item", 1:(ncol(df)/2))
)
or
t(apply(df,1,\(x) {
sapply(seq(1,length(x),2), \(i) ifelse(is.na(x[i]),x[i 1],x[i]))
})) %>% as.data.frame()
Output:
sdq2to4yrs_item1 sdq2to4yrs_item2
1 1 2
2 NA 2
3 2 3
CodePudding user response:
We may use split.default
library(purrr)
library(dplyr)
library(stringr)
df %>%
split.default(str_remove(names(.), ".*_")) %>%
map_dfc(~ invoke(coalesce, .x)) %>%
rename_with(~ str_c("sdq2to4yrs_", .x))
-output
# A tibble: 3 × 2
sdq2to4yrs_item1 sdq2to4yrs_item2
<dbl> <dbl>
1 1 2
2 NA 2
3 2 3
Or another option would be to loop across
the 'sdq2to4yrs' columns, get
the corresponding 'sdq5to17yrs' by replacing the substring from column name (cur_column()
) and do the coalesce
df %>%
transmute(across(starts_with('sdq2to4yrs'),
~ coalesce(.x, get(str_replace(cur_column(), "2to4yrs", "5to17yrs")))))
-output
sdq2to4yrs_item1 sdq2to4yrs_item2
1 1 2
2 NA 2
3 2 3
Or may also do
df %>%
transmute(map2_dfc(across(contains('2to4yrs')),
across(contains('5to17yrs')), coalesce))
sdq2to4yrs_item1 sdq2to4yrs_item2
1 1 2
2 NA 2
3 2 3
Or with across2
library(dplyover)
df %>%
transmute(across2(contains("2to4yrs"), contains("5to17yrs"),
coalesce, .names = "{xcol}"))
sdq2to4yrs_item1 sdq2to4yrs_item2
1 1 2
2 NA 2
3 2 3
Or using base R
data.frame(lapply(split.default(df, sub(".*_", "", names(df))),
function(x) do.call(pmax, c(x, na.rm = TRUE))))