My test dataset
structure(list(record_id = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L), ev_faudit = structure(c(19037, 19037, 19037, 19038,
19038, 19039, 19039, 19040, 19040, 19041, 19041), class = "Date"),
redcap_repeat_instance = c(41L, 41L, 41L, 42L, 42L, 43L,
43L, 44L, 44L, 45L, 45L), ne_cc = c("119697xxxx ", "119697xxxx ",
"119697xxxx ", "119697xxxx ", "119697xxxx ", "119697xxxx ",
"119697xxxx ", "119697xxxx ", "119697xxxx ", "119697xxxx ",
"119697xxxx "), nom_ips = structure(c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), redcapLabels = c("location1",
"location2"), redcapLevels = 1:2), ne_auditor = c("user1",
"user1", "user1", "user1", "user1",
"user1", "user1", "user1", "user1",
"user1", "user1"), ev_dias1 = c(2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1), var2 = c("ev_hallazgo1___1", "ev_apoyo_dx1___9",
"ev_apoyo_dx1___13", "ev_hallazgo1___4", "ev_costo_oportunidad___1",
"ev_hallazgo1___4", "ev_costo_oportunidad___1", "ev_hallazgo1___1",
"ev_apoyo_dx1___13", "ev_hallazgo1___1", "ev_apoyo_dx1___13"
), var1 = c("Ineficiencia", "Videotelemetria", "Polisomnografia",
"Costo de oportunidad", "Demora tramite ERP", "Costo de oportunidad",
"Demora tramite ERP", "Ineficiencia", "Polisomnografia",
"Ineficiencia", "Polisomnografia"), val1 = c(NA, 70000, 150000,
NA, 450000, NA, 450000, NA, 150000, NA, 150000)), row.names = c(NA,
-11L), class = c("tbl_df", "tbl", "data.frame"))
I require to group by record_id
and redcap_repeat_instance
and place the resulting sum of all the values from val1
next to the equivalent val1
when var2
starts with ev_hallazgo1
Desired output
structure(list(record_id = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L), ev_faudit = structure(c(19037, 19037, 19037, 19038,
19038, 19039, 19039, 19040, 19040, 19041, 19041), class = "Date"),
redcap_repeat_instance = c(41L, 41L, 41L, 42L, 42L, 43L,
43L, 44L, 44L, 45L, 45L), ne_cc = c("119697xxxx ", "119697xxxx ",
"119697xxxx ", "119697xxxx ", "119697xxxx ", "119697xxxx ",
"119697xxxx ", "119697xxxx ", "119697xxxx ", "119697xxxx ",
"119697xxxx "), nom_ips = structure(c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L), redcapLabels = c("location1",
"location2"), redcapLevels = 1:2), ne_auditor = c("user1",
"user1", "user1", "user1", "user1",
"user1", "user1", "user1", "user1",
"user1", "user1"), ev_dias1 = c(2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1), var2 = c("ev_hallazgo1___1", "ev_apoyo_dx1___9",
"ev_apoyo_dx1___13", "ev_hallazgo1___4", "ev_costo_oportunidad___1",
"ev_hallazgo1___4", "ev_costo_oportunidad___1", "ev_hallazgo1___1",
"ev_apoyo_dx1___13", "ev_hallazgo1___1", "ev_apoyo_dx1___13"
), var1 = c("Ineficiencia", "Videotelemetria", "Polisomnografia",
"Costo de oportunidad", "Demora tramite ERP", "Costo de oportunidad",
"Demora tramite ERP", "Ineficiencia", "Polisomnografia",
"Ineficiencia", "Polisomnografia"), val1 = c(220000, 70000, 150000,
45000, 450000, 45000, 450000, 150000, 150000, 150000, 150000)), row.names = c(NA,
-11L), class = c("tbl_df", "tbl", "data.frame"))
CodePudding user response:
We can either create the condition on NA
(is.na
) or from the specific substring of 'var1' - grouped by the 'record_id', and 'redcap_repeat_instance', replace
the 'val1' where the 'var2' starts with substring with sum
of all the values in 'val1' for that group
library(dplyr)
library(stringr)
test %>%
group_by(record_id, redcap_repeat_instance) %>%
mutate(val1 = replace(val1, str_detect(var2, "^ev_hallazgo1"),
sum(val1, na.rm = TRUE))) %>%
ungroup
-output
# A tibble: 11 × 10
record_id ev_faudit redcap_repeat_instance ne_cc nom_ips ne_auditor ev_dias1 var2 var1 val1
<int> <date> <int> <chr> <int> <chr> <dbl> <chr> <chr> <dbl>
1 5 2022-02-14 41 "119697xxxx " 2 user1 2 ev_hallazgo1___1 Ineficiencia 220000
2 5 2022-02-14 41 "119697xxxx " 2 user1 2 ev_apoyo_dx1___9 Videotelemetria 70000
3 5 2022-02-14 41 "119697xxxx " 2 user1 2 ev_apoyo_dx1___13 Polisomnografia 150000
4 5 2022-02-15 42 "119697xxxx " 2 user1 1 ev_hallazgo1___4 Costo de oportunidad 450000
5 5 2022-02-15 42 "119697xxxx " 2 user1 1 ev_costo_oportunidad___1 Demora tramite ERP 450000
6 5 2022-02-16 43 "119697xxxx " 2 user1 1 ev_hallazgo1___4 Costo de oportunidad 450000
7 5 2022-02-16 43 "119697xxxx " 2 user1 1 ev_costo_oportunidad___1 Demora tramite ERP 450000
8 5 2022-02-17 44 "119697xxxx " 2 user1 1 ev_hallazgo1___1 Ineficiencia 150000
9 5 2022-02-17 44 "119697xxxx " 2 user1 1 ev_apoyo_dx1___13 Polisomnografia 150000
10 5 2022-02-18 45 "119697xxxx " 2 user1 1 ev_hallazgo1___1 Ineficiencia 150000
11 5 2022-02-18 45 "119697xxxx " 2 user1 1 ev_apoyo_dx1___13 Polisomnografia 150000
If those are the only NA
s, another option is coalesce
test %>%
group_by(record_id, redcap_repeat_instance) %>%
mutate(val1 = coalesce(val1, sum(val1, na.rm = TRUE))) %>%
ungroup