Home > Mobile >  Multiple Condition and Inequality Join
Multiple Condition and Inequality Join

Time:07-11

I've the following code. This code has been in SQL. I want to replicate the same code in R (prefereably dplyr). I found ways to join using inequality conditions but not AND and OR condition. Can someone please guide me?

library(sqldf)


start_date <- c('2022-01-01','2022-02-01','2022-03-01','2022-04-01','2022-05-01','2022-06-01')
end_date <- c('2022-01-31','2022-02-28',NA,NA,'2022-05-31','2022-06-30')

dat1 <- data.frame(start_date, end_date)

date_col <- c('2022-01-15','2022-02-02','2022-03-17','2022-04-22','2022-05-14','2022-06-09')

dat_col <- data.frame(date_col)

final_dat <- sqldf("SELECT * FROM dat_col a LEFT JOIN dat1 b on b.start_date<= a.date_col AND (b.end_date < a.date_col 
                   OR b.end_date IS NULL)")

CodePudding user response:

A possible solution, based on power_left_join:

library(powerjoin)
library(lubridate)

power_left_join(
  dat_col, dat1, 
  by = c(~ ymd(.y$start_date) <= ymd(.x$date_col) &
       (ymd(.y$end_date) < ymd(.x$date_col) | 
       is.na(.y$end_date))))

#>      date_col start_date   end_date
#> 1  2022-01-15       <NA>       <NA>
#> 2  2022-02-02 2022-01-01 2022-01-31
#> 3  2022-03-17 2022-01-01 2022-01-31
#> 4  2022-03-17 2022-02-01 2022-02-28
#> 5  2022-03-17 2022-03-01       <NA>
#> 6  2022-04-22 2022-01-01 2022-01-31
#> 7  2022-04-22 2022-02-01 2022-02-28
#> 8  2022-04-22 2022-03-01       <NA>
#> 9  2022-04-22 2022-04-01       <NA>
#> 10 2022-05-14 2022-01-01 2022-01-31
#> 11 2022-05-14 2022-02-01 2022-02-28
#> 12 2022-05-14 2022-03-01       <NA>
#> 13 2022-05-14 2022-04-01       <NA>
#> 14 2022-06-09 2022-01-01 2022-01-31
#> 15 2022-06-09 2022-02-01 2022-02-28
#> 16 2022-06-09 2022-03-01       <NA>
#> 17 2022-06-09 2022-04-01       <NA>
#> 18 2022-06-09 2022-05-01 2022-05-31

CodePudding user response:

probably the simplest solution is to merge by nothing then filter

dat_col |> 
  merge(dat1, by = NULL) |> 
  filter(start_date <= date_col, (end_date < date_col | is.na(end_date))) |> 
  arrange(-desc(date_col))
     date_col start_date   end_date
1  2022-02-02 2022-01-01 2022-01-31
2  2022-03-17 2022-01-01 2022-01-31
3  2022-03-17 2022-02-01 2022-02-28
4  2022-03-17 2022-03-01       <NA>
5  2022-04-22 2022-01-01 2022-01-31
6  2022-04-22 2022-02-01 2022-02-28
7  2022-04-22 2022-03-01       <NA>
8  2022-04-22 2022-04-01       <NA>
9  2022-05-14 2022-01-01 2022-01-31
10 2022-05-14 2022-02-01 2022-02-28
11 2022-05-14 2022-03-01       <NA>
12 2022-05-14 2022-04-01       <NA>
13 2022-06-09 2022-01-01 2022-01-31
14 2022-06-09 2022-02-01 2022-02-28
15 2022-06-09 2022-03-01       <NA>
16 2022-06-09 2022-04-01       <NA>
17 2022-06-09 2022-05-01 2022-05-31

CodePudding user response:

Another option using merge in one pipe:

library(dplyr)
library(lubridate)
dat_col %>%
  mutate(across(.fns = ymd)) %>%
  merge(., dat1 %>% mutate(across(.fns = ymd)), all.x = TRUE) %>%
  filter(start_date <= date_col & (end_date < date_col | is.na(end_date))) %>%
  arrange(date_col)
#>      date_col start_date   end_date
#> 1  2022-02-02 2022-01-01 2022-01-31
#> 2  2022-03-17 2022-01-01 2022-01-31
#> 3  2022-03-17 2022-02-01 2022-02-28
#> 4  2022-03-17 2022-03-01       <NA>
#> 5  2022-04-22 2022-01-01 2022-01-31
#> 6  2022-04-22 2022-02-01 2022-02-28
#> 7  2022-04-22 2022-03-01       <NA>
#> 8  2022-04-22 2022-04-01       <NA>
#> 9  2022-05-14 2022-01-01 2022-01-31
#> 10 2022-05-14 2022-02-01 2022-02-28
#> 11 2022-05-14 2022-03-01       <NA>
#> 12 2022-05-14 2022-04-01       <NA>
#> 13 2022-06-09 2022-01-01 2022-01-31
#> 14 2022-06-09 2022-02-01 2022-02-28
#> 15 2022-06-09 2022-03-01       <NA>
#> 16 2022-06-09 2022-04-01       <NA>
#> 17 2022-06-09 2022-05-01 2022-05-31

Created on 2022-07-10 by the reprex package (v2.0.1)

CodePudding user response:

Does this work for you?

library(dplyr)
library(lubridate)

start_date <- c('2022-01-01','2022-02-01','2022-03-01','2022-04-01','2022-05-01','2022-06-01')
end_date <- c('2022-01-31','2022-02-28',NA,NA,'2022-05-31','2022-06-30')
dat1 <- data.frame(start_date, end_date)

date_col <- c('2022-01-15','2022-02-02','2022-03-17','2022-04-22','2022-05-14','2022-06-09')
dat_col <- data.frame(date_col)

# changing columns to date -----------------------------------

dat_col <- dat_col %>% mutate(date_col = ymd(date_col))
at1 <- dat1 %>% mutate(across(.fns = ymd))

# joining ----------------------------------------------------

dat_col %>% 
  mutate(placeholder = 1) %>% # creating this just for joining
  left_join(dat1 %>% mutate(placeholder = 1)) %>% 
  filter(start_date <= date_col & (end_date < date_col | is.na(end_date))) %>% 
  select(-placeholder)

#> Joining, by = "placeholder"
#>      date_col start_date   end_date
#> 1  2022-02-02 2022-01-01 2022-01-31
#> 2  2022-03-17 2022-01-01 2022-01-31
#> 3  2022-03-17 2022-02-01 2022-02-28
#> 4  2022-03-17 2022-03-01       <NA>
#> 5  2022-04-22 2022-01-01 2022-01-31
#> 6  2022-04-22 2022-02-01 2022-02-28
#> 7  2022-04-22 2022-03-01       <NA>
#> 8  2022-04-22 2022-04-01       <NA>
#> 9  2022-05-14 2022-01-01 2022-01-31
#> 10 2022-05-14 2022-02-01 2022-02-28
#> 11 2022-05-14 2022-03-01       <NA>
#> 12 2022-05-14 2022-04-01       <NA>
#> 13 2022-06-09 2022-01-01 2022-01-31
#> 14 2022-06-09 2022-02-01 2022-02-28
#> 15 2022-06-09 2022-03-01       <NA>
#> 16 2022-06-09 2022-04-01       <NA>
#> 17 2022-06-09 2022-05-01 2022-05-31

Created on 2022-07-10 by the reprex package (v2.0.1)

  • Related