I have 2 dataframes and I need to merge them based on condition:
# Dataframe 1
kpi_id <- c("SL", "OOS", "SPA","SL", "OOS", "SPA")
date <- c("2021-01-01", "2021-01-01","2021-01-01","2021-02-01","2021-02-01","2021-02-01")
gbu <- c("gbu_1", "gbu_1","gbu_1","gbu_2","gbu_2","gbu_2")
kpi_val <- c (1,2,3,4,5,6)
df1 <- data.frame(kpi_id, date, gbu, kpi_val)
df1
> df1
kpi_id date gbu kpi_val
1 SL 2021-01-01 gbu_1 1
2 OOS 2021-01-01 gbu_1 2
3 SPA 2021-01-01 gbu_1 3
4 SL 2021-02-01 gbu_2 4
5 OOS 2021-02-01 gbu_2 5
6 SPA 2021-02-01 gbu_2 6
# Dataframe 2
gbu <- c("gbu_1", "gbu_2")
date <- c("2021-01-01", "2021-02-01")
kpi_sl <- c(7,8)
kpi_oos <- c(9,10)
kpi_spa <- c(11,12)
df2 <- data.frame(gbu, date, kpi_sl, kpi_oos, kpi_spa)
df2
> df2
gbu date kpi_sl kpi_oos kpi_spa
1 gbu_1 2021-01-01 7 9 11
2 gbu_2 2021-02-01 8 10 12
I need to create new column kpi_val_joined
in df1
values from df2
merging by gbu
and date
, based on a conditions:
if kpi_id == SL --> kpi_val_joined = kpi_sl
if kpi_id == OOS --> kpi_val_joined = kpi_oos
if kpi_id == SPA --> kpi_val_joined = kpi_spa
# Dataframe 3_result.
kpi_id date gbu kpi_val kpi_val_joined
1 SL 2021-01-01 gbu_1 1 7
2 OOS 2021-01-01 gbu_1 2 9
3 SPA 2021-01-01 gbu_1 3 11
4 SL 2021-02-01 gbu_2 4 8
5 OOS 2021-02-01 gbu_2 5 10
6 SPA 2021-02-01 gbu_2 6 12
CodePudding user response:
We can convert df2
to a long format, and capitalize the kpi_id
. Then, we can do a simple left_join
to get the new column, kpi_val_joined
, into df1
.
library(tidyverse)
df1 %>%
left_join(
.,
df2 %>%
pivot_longer(
starts_with("kpi"),
names_to = c(NA, "kpi_id"),
values_to = "kpi_val_joined",
names_sep = '_'
) %>%
mutate(kpi_id = toupper(kpi_id)),
by = c("gbu", "date", "kpi_id")
)
Output
kpi_id date gbu kpi_val kpi_val_joined
1 SL 2021-01-01 gbu_1 1 7
2 OOS 2021-01-01 gbu_1 2 9
3 SPA 2021-01-01 gbu_1 3 11
4 SL 2021-02-01 gbu_2 4 8
5 OOS 2021-02-01 gbu_2 5 10
6 SPA 2021-02-01 gbu_2 6 12
Update
Since we only want to join on specific instances, then we can create a helper_id
column in df1
for the conditions that we want to meet. Then, we can join to the long form of df2
.
df1 %>%
mutate(helper_id = case_when(kpi_id == "SL" ~ "kpi_sl",
kpi_id == "OOS" ~ "kpi_oos",
kpi_id == "SPA" ~ "kpi_spa",
TRUE ~ NA_character_)) %>%
left_join(
.,
df2 %>%
pivot_longer(starts_with("kpi"), names_to = "helper_id", values_to = "kpi_val_joined"),
by = c("gbu", "date", "helper_id")
) %>%
select(-helper_id)
CodePudding user response:
No need to do any pivoting; just join on date/gbu, and create the new column using a case_statement, like this:
library(data.table)
setDT(df1)[setDT(df2), on=.(date,gbu), kpi_val_joined:=fcase(
kpi_id == "SL", kpi_sl,
kpi_id == "OOS", kpi_oos,
kpi_id == "SPA", kpi_spa
)]
Output:
kpi_id date gbu kpi_val kpi_val_joined
<char> <char> <char> <num> <num>
1: SL 2021-01-01 gbu_1 1 7
2: OOS 2021-01-01 gbu_1 2 9
3: SPA 2021-01-01 gbu_1 3 11
4: SL 2021-02-01 gbu_2 4 8
5: OOS 2021-02-01 gbu_2 5 10
6: SPA 2021-02-01 gbu_2 6 12
If you want you can also do this with tidyverse, no pivoting necessary.
inner_join(df1,df2, by=c("date","gbu")) %>%
mutate(kpi_val_joined = case_when(
kpi_id == "SL"~kpi_sl,
kpi_id == "OOS"~kpi_oos,
kpi_id == "SPA"~kpi_spa
)) %>%
select(-(kpi_sl:kpi_spa))