Home > Blockchain >  Mutate multiple columns using the dplyr framework
Mutate multiple columns using the dplyr framework

Time:05-06

I have a dataframe apcd_hud_ex. I want to take some column names (e.g. x2014_03_15), and change the value of the columns based on the current value of the columns, the parsed date in the column names, and another column in the dataframe (SMOKEFREE_DATE). I can do it in a loop over the columns, but I would really like to know how to do it with dplyr and mutate. Any help would be much appreciated!

apcd_hud_ex = structure(list(studyid = 1:5, SMOKEFREE_DATE = structure(c(16283, 
16283, 16071, 16071, 16648), class = "Date"), x2014_03_15 = c(1, 
1, 1, 0, 1), x2014_04_15 = c(1, 1, 1, 1, 1), x2014_05_15 = c(1, 
1, 1, 1, 1), x2014_06_15 = c(1, 1, 1, 1, 1), x2014_07_15 = c(1, 
1, 1, 1, 1), x2014_08_15 = c(1, 1, 1, 1, 1), x2014_09_15 = c(1, 
1, 1, 1, 1), x2014_10_15 = c(1, 1, 1, 1, 1), x2014_11_15 = c(1, 
1, 1, 1, 1), x2014_12_15 = c(1, 1, 1, 1, 1), x2015_01_15 = c(1, 
1, 1, 1, 1)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", 
"data.frame"))

> apcd_hud_ex
# A tibble: 5 x 13
  studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15
    <int> <date>               <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
1       1 2014-08-01               1           1           1           1           1           1           1           1
2       2 2014-08-01               1           1           1           1           1           1           1           1
3       3 2014-01-01               1           1           1           1           1           1           1           1
4       4 2014-01-01               0           1           1           1           1           1           1           1
5       5 2015-08-01               1           1           1           1           1           1           1           1
# ... with 3 more variables: x2014_11_15 <dbl>, x2014_12_15 <dbl>, x2015_01_15 <dbl>
>


#function for loop
assign_PHRes_enrollIns_fn <- function(SFdate,insValue,insDate){
  val = if_else(insValue == 0,
                0,
                if_else(as.Date(insDate) < as.Date(SFdate,"%Y-%m-%d"),
                        1,
                        2))
  return(val)
}

#vectorized function
assign_PHRes_enrollIns_fn_vec <- Vectorize(assign_PHRes_enrollIns_fn)

dateCols = names(apcd_hud_ex)[which(names(apcd_hud_ex) == "x2014_03_15"):which(names(apcd_hud_ex) == "x2015_01_15")]

This loop over the column names (dateCols) works:

for(i in 1:length(dateCols)){
  dateCol = dateCols[i]
  insDate = as.Date(paste0(str_sub(dateCol,2,5),"/",str_sub(dateCol,7,8),"/",str_sub(dateCol,10,11)),"%Y/%m/%d")
  apcd_hud_ex[,dateCol] = assign_PHRes_enrollIns_fn_vec(apcd_hud_ex[,"SMOKEFREE_DATE"],apcd_hud_ex[,dateCol],insDate)
}

Now the manipulated dataframe looks like this, which is what I want:

> apcd_hud_ex
# A tibble: 5 x 13
  studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15
    <int> <date>               <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
1       1 2014-08-01               1           1           1           1           1           2           2           2
2       2 2014-08-01               1           1           1           1           1           2           2           2
3       3 2014-01-01               2           2           2           2           2           2           2           2
4       4 2014-01-01               0           2           2           2           2           2           2           2
5       5 2015-08-01               1           1           1           1           1           1           1           1
# ... with 3 more variables: x2014_11_15 <dbl>, x2014_12_15 <dbl>, x2015_01_15 <dbl>

However, I would like to learn how to do this with dynamic programming and dplyr. I've tried 2 functions:

newInsValCols_fn1 <- function(df,dateCols){
  insDate = as.Date(paste0(str_sub(dateCols,2,5),"/",str_sub(dateCols,7,8),"/",str_sub(dateCols,10,11)),"%Y/%m/%d")

  df1 <- df %>%
    mutate({{dateCols}} := if_else({{dateCols}} == 0,
                                   0,
                                   if_else(as.Date(insDate) < as.Date(SMOKEFREE_DATE,"%Y-%m-%d"),
                                           1,
                                           2)))
 return(df1)
} 
newInsValCols_fn1(apcd_hud_ex,dateCols)

Which gives error:

 Error: The LHS of `:=` must be a string or a symbol

So I tried using symbols:

newInsValCols_fn2 <- function(df,dateCols){
  dateCols_syms = syms(dateCols)
  insDate = as.Date(paste0(str_sub(dateCols,2,5),"/",str_sub(dateCols,7,8),"/",str_sub(dateCols,10,11)),"%Y/%m/%d")
  df1 <- df %>%
    mutate(!!dateCols_syms := if_else({{dateCols}} == 0,
                                      0,
                                      if_else(as.Date(insDate) < as.Date(SMOKEFREE_DATE,"%Y-%m-%d"),
                                              1,
                                              2)))
  return(df1)
} 
newInsValCols_fn2(apcd_hud_ex,dateCols)

which gives the same error:

Error: The LHS of `:=` must be a string or a symbol

I also tried using !!! instead of !!, but that resulted in the following error:

 Error: The LHS of `:=` can't be spliced with `!!!`

Something in my understanding is lacking.

CodePudding user response:

Here's how I'd do it with dplyr.

library(dplyr)
library(lubridate)

apcd_hud_ex %>%
  mutate(across(
    starts_with('x'),
    ~ case_when(. == 0 ~ 0,
                ymd(gsub('x', '', cur_column())) < SMOKEFREE_DATE ~ 1,
                TRUE ~ 2)
  ))

#> # A tibble: 5 x 13
#>   studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15 x2014_07_15 x2014_08_15 x2014_09_15 x2014_10_15 x2014_11_15 x2014_12_15 x2015_01_15
#>     <int> <date>               <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
#> 1       1 2014-08-01               1           1           1           1           1           2           2           2           2           2           2
#> 2       2 2014-08-01               1           1           1           1           1           2           2           2           2           2           2
#> 3       3 2014-01-01               2           2           2           2           2           2           2           2           2           2           2
#> 4       4 2014-01-01               0           2           2           2           2           2           2           2           2           2           2
#> 5       5 2015-08-01               1           1           1           1           1           1           1           1           1           1           1

CodePudding user response:

You can use pivot_longer to have just one column to modify, which is an alternative to mutate(across()).

You can use case_when to have multiple conditions, so you do not need to nest multiple if statements. The value will be the one of the first true statement.

library(tidyverse)

apcd_hud_ex <- structure(list(studyid = 1:5, SMOKEFREE_DATE = structure(c(
  16283,
  16283, 16071, 16071, 16648
), class = "Date"), x2014_03_15 = c(
  1,
  1, 1, 0, 1
), x2014_04_15 = c(1, 1, 1, 1, 1), x2014_05_15 = c(
  1,
  1, 1, 1, 1
), x2014_06_15 = c(1, 1, 1, 1, 1), x2014_07_15 = c(
  1,
  1, 1, 1, 1
), x2014_08_15 = c(1, 1, 1, 1, 1), x2014_09_15 = c(
  1,
  1, 1, 1, 1
), x2014_10_15 = c(1, 1, 1, 1, 1), x2014_11_15 = c(
  1,
  1, 1, 1, 1
), x2014_12_15 = c(1, 1, 1, 1, 1), x2015_01_15 = c(
  1,
  1, 1, 1, 1
)), row.names = c(NA, -5L), class = c(
  "tbl_df", "tbl",
  "data.frame"
))

apcd_hud_ex %>%
  pivot_longer(starts_with("x")) %>%
  mutate(
    insDate = name %>% str_remove("^x") %>% str_replace_all("_", "-") %>% as.Date(),
    value = case_when(
      value == 0 ~ 0,
      insDate < SMOKEFREE_DATE ~ 1,
      insDate >= SMOKEFREE_DATE ~ 2
    )
  ) %>%
  select(-insDate) %>%
  pivot_wider()
#> # A tibble: 5 × 13
#>   studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15
#>     <int> <date>               <dbl>       <dbl>       <dbl>       <dbl>
#> 1       1 2014-08-01               1           1           1           1
#> 2       2 2014-08-01               1           1           1           1
#> 3       3 2014-01-01               2           2           2           2
#> 4       4 2014-01-01               0           2           2           2
#> 5       5 2015-08-01               1           1           1           1
#> # … with 7 more variables: x2014_07_15 <dbl>, x2014_08_15 <dbl>,
#> #   x2014_09_15 <dbl>, x2014_10_15 <dbl>, x2014_11_15 <dbl>, x2014_12_15 <dbl>,
#> #   x2015_01_15 <dbl>

Created on 2022-05-05 by the reprex package (v2.0.0)

  • Related