Home > Mobile >  Merging data using left_join based on a condition R
Merging data using left_join based on a condition R

Time:05-04

I have 2 dataframes and I need to merge them based on condition:

# Dataframe 1
kpi_id <- c("SL",  "OOS", "SPA","SL",  "OOS", "SPA")
date <- c("2021-01-01",  "2021-01-01","2021-01-01","2021-02-01","2021-02-01","2021-02-01")
gbu <- c("gbu_1",  "gbu_1","gbu_1","gbu_2","gbu_2","gbu_2")
kpi_val <- c (1,2,3,4,5,6)

df1 <-  data.frame(kpi_id,  date, gbu, kpi_val)
df1

> df1
  kpi_id       date   gbu      kpi_val
1     SL   2021-01-01 gbu_1       1
2    OOS   2021-01-01 gbu_1       2
3    SPA   2021-01-01 gbu_1       3
4     SL   2021-02-01 gbu_2       4
5    OOS   2021-02-01 gbu_2       5
6    SPA   2021-02-01 gbu_2       6

# Dataframe 2
gbu <- c("gbu_1",  "gbu_2")
date <- c("2021-01-01", "2021-02-01")
kpi_sl <- c(7,8)
kpi_oos <- c(9,10)
kpi_spa <- c(11,12)

df2 <-  data.frame(gbu,  date, kpi_sl, kpi_oos, kpi_spa)
df2
> df2
    gbu       date        kpi_sl kpi_oos kpi_spa
1 gbu_1      2021-01-01      7       9      11
2 gbu_2      2021-02-01      8      10      12

I need to create new column kpi_val_joined in df1 values from df2 merging by gbu and date, based on a conditions:

if kpi_id == SL --> kpi_val_joined = kpi_sl
if kpi_id == OOS --> kpi_val_joined = kpi_oos
if kpi_id == SPA --> kpi_val_joined = kpi_spa

# Dataframe 3_result.


  kpi_id       date       gbu     kpi_val    kpi_val_joined
1     SL     2021-01-01   gbu_1       1              7
2    OOS     2021-01-01   gbu_1       2              9
3    SPA     2021-01-01   gbu_1       3             11

4     SL     2021-02-01   gbu_2       4              8
5    OOS     2021-02-01   gbu_2       5             10
6    SPA     2021-02-01   gbu_2       6             12

CodePudding user response:

We can convert df2 to a long format, and capitalize the kpi_id. Then, we can do a simple left_join to get the new column, kpi_val_joined, into df1.

library(tidyverse)

df1 %>%
  left_join(
    .,
    df2 %>%
      pivot_longer(
        starts_with("kpi"),
        names_to = c(NA, "kpi_id"),
        values_to = "kpi_val_joined",
        names_sep = '_'
      ) %>%
      mutate(kpi_id = toupper(kpi_id)),
    by = c("gbu", "date", "kpi_id")
  )

Output

  kpi_id       date   gbu kpi_val kpi_val_joined
1     SL 2021-01-01 gbu_1       1              7
2    OOS 2021-01-01 gbu_1       2              9
3    SPA 2021-01-01 gbu_1       3             11
4     SL 2021-02-01 gbu_2       4              8
5    OOS 2021-02-01 gbu_2       5             10
6    SPA 2021-02-01 gbu_2       6             12

Update

Since we only want to join on specific instances, then we can create a helper_id column in df1 for the conditions that we want to meet. Then, we can join to the long form of df2.

df1 %>%
  mutate(helper_id = case_when(kpi_id == "SL" ~ "kpi_sl",
                               kpi_id == "OOS" ~ "kpi_oos",
                               kpi_id == "SPA" ~ "kpi_spa",
                               TRUE ~ NA_character_)) %>%
  left_join(
    .,
    df2 %>%
      pivot_longer(starts_with("kpi"), names_to = "helper_id", values_to = "kpi_val_joined"),
    by = c("gbu", "date", "helper_id")
  ) %>% 
  select(-helper_id)

CodePudding user response:

No need to do any pivoting; just join on date/gbu, and create the new column using a case_statement, like this:

library(data.table)

setDT(df1)[setDT(df2), on=.(date,gbu), kpi_val_joined:=fcase(
  kpi_id == "SL", kpi_sl,
  kpi_id == "OOS", kpi_oos,
  kpi_id == "SPA", kpi_spa
)]

Output:

   kpi_id       date    gbu kpi_val kpi_val_joined
   <char>     <char> <char>   <num>          <num>
1:     SL 2021-01-01  gbu_1       1              7
2:    OOS 2021-01-01  gbu_1       2              9
3:    SPA 2021-01-01  gbu_1       3             11
4:     SL 2021-02-01  gbu_2       4              8
5:    OOS 2021-02-01  gbu_2       5             10
6:    SPA 2021-02-01  gbu_2       6             12

If you want you can also do this with tidyverse, no pivoting necessary.

inner_join(df1,df2, by=c("date","gbu")) %>% 
  mutate(kpi_val_joined = case_when(
    kpi_id == "SL"~kpi_sl,
    kpi_id == "OOS"~kpi_oos,
    kpi_id == "SPA"~kpi_spa  
  )) %>% 
  select(-(kpi_sl:kpi_spa))
  • Related