How to group and sum with a special condition?-CodePudding

My test dataset

structure(list(record_id = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L), ev_faudit = structure(c(19037, 19037, 19037, 19038, 
19038, 19039, 19039, 19040, 19040, 19041, 19041), class = "Date"), 
    redcap_repeat_instance = c(41L, 41L, 41L, 42L, 42L, 43L, 
    43L, 44L, 44L, 45L, 45L), ne_cc = c("119697xxxx ", "119697xxxx ", 
    "119697xxxx ", "119697xxxx ", "119697xxxx ", "119697xxxx ", 
    "119697xxxx ", "119697xxxx ", "119697xxxx ", "119697xxxx ", 
    "119697xxxx "), nom_ips = structure(c(2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L), redcapLabels = c("location1", 
    "location2"), redcapLevels = 1:2), ne_auditor = c("user1", 
    "user1", "user1", "user1", "user1", 
    "user1", "user1", "user1", "user1", 
    "user1", "user1"), ev_dias1 = c(2, 2, 2, 
    1, 1, 1, 1, 1, 1, 1, 1), var2 = c("ev_hallazgo1___1", "ev_apoyo_dx1___9", 
    "ev_apoyo_dx1___13", "ev_hallazgo1___4", "ev_costo_oportunidad___1", 
    "ev_hallazgo1___4", "ev_costo_oportunidad___1", "ev_hallazgo1___1", 
    "ev_apoyo_dx1___13", "ev_hallazgo1___1", "ev_apoyo_dx1___13"
    ), var1 = c("Ineficiencia", "Videotelemetria", "Polisomnografia", 
    "Costo de oportunidad", "Demora tramite ERP", "Costo de oportunidad", 
    "Demora tramite ERP", "Ineficiencia", "Polisomnografia", 
    "Ineficiencia", "Polisomnografia"), val1 = c(NA, 70000, 150000, 
    NA, 450000, NA, 450000, NA, 150000, NA, 150000)), row.names = c(NA, 
-11L), class = c("tbl_df", "tbl", "data.frame"))

I require to group by record_id and redcap_repeat_instance and place the resulting sum of all the values from val1 next to the equivalent val1 when var2 starts with ev_hallazgo1

Desired output

structure(list(record_id = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L), ev_faudit = structure(c(19037, 19037, 19037, 19038, 
19038, 19039, 19039, 19040, 19040, 19041, 19041), class = "Date"), 
    redcap_repeat_instance = c(41L, 41L, 41L, 42L, 42L, 43L, 
    43L, 44L, 44L, 45L, 45L), ne_cc = c("119697xxxx ", "119697xxxx ", 
    "119697xxxx ", "119697xxxx ", "119697xxxx ", "119697xxxx ", 
    "119697xxxx ", "119697xxxx ", "119697xxxx ", "119697xxxx ", 
    "119697xxxx "), nom_ips = structure(c(2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L), redcapLabels = c("location1", 
    "location2"), redcapLevels = 1:2), ne_auditor = c("user1", 
    "user1", "user1", "user1", "user1", 
    "user1", "user1", "user1", "user1", 
    "user1", "user1"), ev_dias1 = c(2, 2, 2, 
    1, 1, 1, 1, 1, 1, 1, 1), var2 = c("ev_hallazgo1___1", "ev_apoyo_dx1___9", 
    "ev_apoyo_dx1___13", "ev_hallazgo1___4", "ev_costo_oportunidad___1", 
    "ev_hallazgo1___4", "ev_costo_oportunidad___1", "ev_hallazgo1___1", 
    "ev_apoyo_dx1___13", "ev_hallazgo1___1", "ev_apoyo_dx1___13"
    ), var1 = c("Ineficiencia", "Videotelemetria", "Polisomnografia", 
    "Costo de oportunidad", "Demora tramite ERP", "Costo de oportunidad", 
    "Demora tramite ERP", "Ineficiencia", "Polisomnografia", 
    "Ineficiencia", "Polisomnografia"), val1 = c(220000, 70000, 150000, 
    45000, 450000, 45000, 450000, 150000, 150000, 150000, 150000)), row.names = c(NA, 
-11L), class = c("tbl_df", "tbl", "data.frame"))

CodePudding user response：

We can either create the condition on NA (is.na) or from the specific substring of 'var1' - grouped by the 'record_id', and 'redcap_repeat_instance', replace the 'val1' where the 'var2' starts with substring with sum of all the values in 'val1' for that group

library(dplyr)
library(stringr)
test %>% 
  group_by(record_id, redcap_repeat_instance) %>% 
  mutate(val1 = replace(val1, str_detect(var2, "^ev_hallazgo1"), 
        sum(val1, na.rm = TRUE))) %>%
  ungroup

-output

# A tibble: 11 × 10
   record_id ev_faudit  redcap_repeat_instance ne_cc         nom_ips ne_auditor ev_dias1 var2                     var1                   val1
       <int> <date>                      <int> <chr>           <int> <chr>         <dbl> <chr>                    <chr>                 <dbl>
 1         5 2022-02-14                     41 "119697xxxx "       2 user1             2 ev_hallazgo1___1         Ineficiencia         220000
 2         5 2022-02-14                     41 "119697xxxx "       2 user1             2 ev_apoyo_dx1___9         Videotelemetria       70000
 3         5 2022-02-14                     41 "119697xxxx "       2 user1             2 ev_apoyo_dx1___13        Polisomnografia      150000
 4         5 2022-02-15                     42 "119697xxxx "       2 user1             1 ev_hallazgo1___4         Costo de oportunidad 450000
 5         5 2022-02-15                     42 "119697xxxx "       2 user1             1 ev_costo_oportunidad___1 Demora tramite ERP   450000
 6         5 2022-02-16                     43 "119697xxxx "       2 user1             1 ev_hallazgo1___4         Costo de oportunidad 450000
 7         5 2022-02-16                     43 "119697xxxx "       2 user1             1 ev_costo_oportunidad___1 Demora tramite ERP   450000
 8         5 2022-02-17                     44 "119697xxxx "       2 user1             1 ev_hallazgo1___1         Ineficiencia         150000
 9         5 2022-02-17                     44 "119697xxxx "       2 user1             1 ev_apoyo_dx1___13        Polisomnografia      150000
10         5 2022-02-18                     45 "119697xxxx "       2 user1             1 ev_hallazgo1___1         Ineficiencia         150000
11         5 2022-02-18                     45 "119697xxxx "       2 user1             1 ev_apoyo_dx1___13        Polisomnografia      150000

If those are the only NAs, another option is coalesce

test %>% 
 group_by(record_id, redcap_repeat_instance)  %>%
 mutate(val1 = coalesce(val1, sum(val1, na.rm = TRUE))) %>% 
 ungroup