I have a data.frame df1. Some selected rows have been manually reviewed and updated, creating a second data.frame df1_update, which has all the same columns (with some of the data changed) plus additional columns.
I want to join the updated version to the original and, where the data has been changed, replace the original values; where there has been no change, retain the original; and where the data has not been reviewed (i.e. the row is not in df1_update), retain the original.
I have done this in this small example as follows:
library(lubridate)
library(dplyr)
library(tidyr)
df1 = data.frame(id = c(1, 2, 3, 4, 5),
                 date = dmy(c("15/01/2020", "03/12/2020", "20/08/2019", "01/01/2021", "01/02/2021")),
                 type = c("type_A", "type_A", "type_B", "type_C", "type_B"))

df1_update = data.frame(id = c(1, 2, 3),
                        date = dmy(c("25/01/2020", "03/12/2020", "20/08/2019")),
                        type = c("type_A", "type_B", "type_B"),
                        new_info = c("note", "nil", "note"))
df3 = left_join(df1, df1_update, by = "id") %>%
  mutate(date = case_when(is.na(date.y) ~ date.x,
                          date.x == date.y ~ date.x,
                          date.x != date.y ~ date.y),
         type = case_when(is.na(type.y) ~ type.x,
                          type.x == type.y ~ type.x,
                          type.x != type.y ~ type.y)) %>%
  select(-contains(c(".x", ".y")))
df3
> df3
  id new_info       date   type
1  1     note 2020-01-25 type_A
2  2      nil 2020-12-03 type_B
3  3     note 2019-08-20 type_B
4  4     <NA> 2021-01-01 type_C
5  5     <NA> 2021-02-01 type_B
In my real data set I have around 16 columns that have been reviewed and updated. Is it possible to compare all the pairs of columns ending in .x and .y without having to name each pair as I have above? I'm guessing it may be possible by writing a function.
CodePudding user response:
It may be easier with coalesce (if there are not many conditions; otherwise case_when can be used). In addition, assuming that there is always a .y column for each corresponding .x column, loop across the .x columns, replace the substring .x of the column name (cur_column()) with .y, get the value, apply case_when, name the output column with .names and remove the unused columns with .keep.
library(dplyr)
library(stringr)
left_join(df1, df1_update, by = "id") %>%
  mutate(across(ends_with('.x'),
                ~ {
                  # look up the value of the matching .y column for the current .x column
                  ydat <- get(str_replace(cur_column(), '.x', '.y'))
                  case_when(is.na(ydat) ~ .x,
                            .x == ydat ~ .x,
                            .x != ydat ~ ydat)
                },
                # name the output after the base column, i.e. drop the .x suffix
                .names = "{str_remove(.col, '.x')}"),
         .keep = 'unused')
-output
  id new_info       date   type
1  1     note 2020-01-25 type_A
2  2      nil 2020-12-03 type_B
3  3     note 2019-08-20 type_B
4  4     <NA> 2021-01-01 type_C
5  5     <NA> 2021-02-01 type_B
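For this particular update rule (take the reviewed value whenever one exists), the case_when block can also be collapsed to coalesce: a non-NA .y value is used whether or not it differs from .x, so coalesce(ydat, .x) gives the same result on this data. A minimal sketch of that variant, using fixed() so the dot in '.x' is matched literally:
library(dplyr)
library(stringr)
left_join(df1, df1_update, by = "id") %>%
  mutate(across(ends_with('.x'),
                # take the reviewed .y value where it exists, otherwise keep the original .x value
                ~ coalesce(get(str_replace(cur_column(), fixed('.x'), '.y')), .x),
                .names = "{str_remove(.col, fixed('.x'))}"),
         .keep = 'unused')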
CodePudding user response:
Another way with a function:
library(dplyr)
library(purrr)
coalesce_from_base <- function(df, base) {
  # build the paired column names, e.g. "date.x" / "date.y", from the base name
  .x <- paste0(base, ".x")
  .y <- paste0(base, ".y")
  df %>%
    mutate(!!sym(base) := case_when(is.na(.data[[.y]]) ~ .data[[.x]],
                                    .data[[.x]] == .data[[.y]] ~ .data[[.x]],
                                    .data[[.x]] != .data[[.y]] ~ .data[[.y]]))
}
# join together
df3 <- left_join(df1, df1_update, by = "id")

# create a vector of base field names to iterate over
col_base <- c("date", "type")
# col_base <- stringr::str_subset(names(df3), "\\.x$") %>% stringr::str_remove("\\.x$") # or build it from the column names

# use reduce to cumulatively iterate over each base value
reduce(col_base, coalesce_from_base, .init = df3) %>%
  select(-ends_with(c(".x", ".y")))
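For the two example columns, reduce() simply threads the joined data frame through the helper once per base name, so the call above is equivalent to the chain below, which can be handy when checking the logic on a couple of columns:
df3 %>%
  coalesce_from_base("date") %>%
  coalesce_from_base("type") %>%
  select(-ends_with(c(".x", ".y")))
With the full set of roughly 16 reviewed columns, col_base is just a longer vector and reduce() saves writing the chain out by hand.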