Home > Enterprise >  How to coalesce multiple columns using tidyverse cur_column()
How to coalesce multiple columns using tidyverse cur_column()

Time:08-25

There are many items (25 in the SDQ: strengths and difficulties scale) for which I would like to coalesce the 2to4yrs and 5to17yrs versions into the 2to4yrs columns.

library(tidyverse)
df <- data.frame(
  sdq2to4yrs_item1 = c(1, NA, NA),
  sdq5to17yrs_item1 = c(NA, NA, 2),
  sdq2to4yrs_item2 = c(2, 2, NA),
  sdq5to17yrs_item2 = c(1, 2, 3)
)

df
#>   sdq2to4yrs_item1 sdq5to17yrs_item1 sdq2to4yrs_item2 sdq5to17yrs_item2
#> 1                1                NA                2                 1
#> 2               NA                NA                2                 2
#> 3               NA                 2               NA                 3

## What I'm after
data.frame(
  sdq2to4yrs_item1 = c(1, NA, 2),
  sdq2to4yrs_item2 = c(2, 2, 3)
)
#>   sdq2to4yrs_item1 sdq2to4yrs_item2
#> 1                1                2
#> 2               NA                2
#> 3                2                3

## The code I'd like to work

df %>%
  mutate(
    across(
      matches("2to4yrs"),
      ~ coalesce(
        !!!select(., matches(
          cur_column() %>% str_remove(".*yrs_")
        )
        )
      )
    )
  )
#> Error in local_error_context(dots = dots, .index = i, mask = mask): promise already under evaluation: recursive default argument reference or earlier problems?

Created on 2022-08-24 by the reprex package (v2.0.1)

CodePudding user response:

Here is an option that pivots longer, chooses the value per row/item, and then pivots back to wide

df %>% 
  mutate(i=row_number()) %>% 
  pivot_longer(-i, names_to=c("a", "item"), names_sep = "_" ,values_to = "v") %>% 
  group_by(i,item) %>% 
  summarize(k=if_else(is.na(v[1]),v[2],v[1]), .groups = "drop") %>% 
  pivot_wider(names_from = item, names_prefix = "sdq2to4years_", values_from=k)

Output:

     id sdq2to4years_item1 sdq2to4years_item2
  <int>              <dbl>              <dbl>
1     1                  1                  2
2     2                 NA                  2
3     3                  2                  3

or a couple of base R approaches:

setNames(
  data.frame(sapply(seq(1, ncol(df),2),\(x) if_else(is.na(df[,x]),df[,x 1],df[,x]))),
  paste0("sdq2to4yrs_item", 1:(ncol(df)/2))
)

or

t(apply(df,1,\(x) {
  sapply(seq(1,length(x),2), \(i) ifelse(is.na(x[i]),x[i 1],x[i]))
})) %>% as.data.frame()

Output:

  sdq2to4yrs_item1 sdq2to4yrs_item2
1                1                2
2               NA                2
3                2                3

CodePudding user response:

We may use split.default

library(purrr)
library(dplyr)
library(stringr)
df %>% 
   split.default(str_remove(names(.), ".*_")) %>%
   map_dfc(~ invoke(coalesce, .x)) %>%
   rename_with(~ str_c("sdq2to4yrs_", .x))

-output

# A tibble: 3 × 2
  sdq2to4yrs_item1 sdq2to4yrs_item2
             <dbl>            <dbl>
1                1                2
2               NA                2
3                2                3

Or another option would be to loop across the 'sdq2to4yrs' columns, get the corresponding 'sdq5to17yrs' by replacing the substring from column name (cur_column()) and do the coalesce

df %>%
   transmute(across(starts_with('sdq2to4yrs'), 
    ~ coalesce(.x, get(str_replace(cur_column(), "2to4yrs", "5to17yrs")))))

-output

   sdq2to4yrs_item1 sdq2to4yrs_item2
1                1                2
2               NA                2
3                2                3

Or may also do

df %>% 
  transmute(map2_dfc(across(contains('2to4yrs')), 
    across(contains('5to17yrs')), coalesce))
  sdq2to4yrs_item1 sdq2to4yrs_item2
1                1                2
2               NA                2
3                2                3

Or with across2

library(dplyover)
df %>%
   transmute(across2(contains("2to4yrs"), contains("5to17yrs"), 
      coalesce, .names = "{xcol}"))
   sdq2to4yrs_item1 sdq2to4yrs_item2
1                1                2
2               NA                2
3                2                3

Or using base R

data.frame(lapply(split.default(df, sub(".*_", "", names(df))), 
     function(x) do.call(pmax, c(x, na.rm = TRUE))))
  •  Tags:  
  • r
  • Related