I've the following code. This code has been in SQL. I want to replicate the same code in R (prefereably dplyr
). I found ways to join using inequality conditions but not AND
and OR
condition. Can someone please guide me?
library(sqldf)
start_date <- c('2022-01-01','2022-02-01','2022-03-01','2022-04-01','2022-05-01','2022-06-01')
end_date <- c('2022-01-31','2022-02-28',NA,NA,'2022-05-31','2022-06-30')
dat1 <- data.frame(start_date, end_date)
date_col <- c('2022-01-15','2022-02-02','2022-03-17','2022-04-22','2022-05-14','2022-06-09')
dat_col <- data.frame(date_col)
final_dat <- sqldf("SELECT * FROM dat_col a LEFT JOIN dat1 b on b.start_date<= a.date_col AND (b.end_date < a.date_col
OR b.end_date IS NULL)")
CodePudding user response:
A possible solution, based on power_left_join
:
library(powerjoin)
library(lubridate)
power_left_join(
dat_col, dat1,
by = c(~ ymd(.y$start_date) <= ymd(.x$date_col) &
(ymd(.y$end_date) < ymd(.x$date_col) |
is.na(.y$end_date))))
#> date_col start_date end_date
#> 1 2022-01-15 <NA> <NA>
#> 2 2022-02-02 2022-01-01 2022-01-31
#> 3 2022-03-17 2022-01-01 2022-01-31
#> 4 2022-03-17 2022-02-01 2022-02-28
#> 5 2022-03-17 2022-03-01 <NA>
#> 6 2022-04-22 2022-01-01 2022-01-31
#> 7 2022-04-22 2022-02-01 2022-02-28
#> 8 2022-04-22 2022-03-01 <NA>
#> 9 2022-04-22 2022-04-01 <NA>
#> 10 2022-05-14 2022-01-01 2022-01-31
#> 11 2022-05-14 2022-02-01 2022-02-28
#> 12 2022-05-14 2022-03-01 <NA>
#> 13 2022-05-14 2022-04-01 <NA>
#> 14 2022-06-09 2022-01-01 2022-01-31
#> 15 2022-06-09 2022-02-01 2022-02-28
#> 16 2022-06-09 2022-03-01 <NA>
#> 17 2022-06-09 2022-04-01 <NA>
#> 18 2022-06-09 2022-05-01 2022-05-31
CodePudding user response:
probably the simplest solution is to merge by nothing then filter
dat_col |>
merge(dat1, by = NULL) |>
filter(start_date <= date_col, (end_date < date_col | is.na(end_date))) |>
arrange(-desc(date_col))
date_col start_date end_date
1 2022-02-02 2022-01-01 2022-01-31
2 2022-03-17 2022-01-01 2022-01-31
3 2022-03-17 2022-02-01 2022-02-28
4 2022-03-17 2022-03-01 <NA>
5 2022-04-22 2022-01-01 2022-01-31
6 2022-04-22 2022-02-01 2022-02-28
7 2022-04-22 2022-03-01 <NA>
8 2022-04-22 2022-04-01 <NA>
9 2022-05-14 2022-01-01 2022-01-31
10 2022-05-14 2022-02-01 2022-02-28
11 2022-05-14 2022-03-01 <NA>
12 2022-05-14 2022-04-01 <NA>
13 2022-06-09 2022-01-01 2022-01-31
14 2022-06-09 2022-02-01 2022-02-28
15 2022-06-09 2022-03-01 <NA>
16 2022-06-09 2022-04-01 <NA>
17 2022-06-09 2022-05-01 2022-05-31
CodePudding user response:
Another option using merge
in one pipe:
library(dplyr)
library(lubridate)
dat_col %>%
mutate(across(.fns = ymd)) %>%
merge(., dat1 %>% mutate(across(.fns = ymd)), all.x = TRUE) %>%
filter(start_date <= date_col & (end_date < date_col | is.na(end_date))) %>%
arrange(date_col)
#> date_col start_date end_date
#> 1 2022-02-02 2022-01-01 2022-01-31
#> 2 2022-03-17 2022-01-01 2022-01-31
#> 3 2022-03-17 2022-02-01 2022-02-28
#> 4 2022-03-17 2022-03-01 <NA>
#> 5 2022-04-22 2022-01-01 2022-01-31
#> 6 2022-04-22 2022-02-01 2022-02-28
#> 7 2022-04-22 2022-03-01 <NA>
#> 8 2022-04-22 2022-04-01 <NA>
#> 9 2022-05-14 2022-01-01 2022-01-31
#> 10 2022-05-14 2022-02-01 2022-02-28
#> 11 2022-05-14 2022-03-01 <NA>
#> 12 2022-05-14 2022-04-01 <NA>
#> 13 2022-06-09 2022-01-01 2022-01-31
#> 14 2022-06-09 2022-02-01 2022-02-28
#> 15 2022-06-09 2022-03-01 <NA>
#> 16 2022-06-09 2022-04-01 <NA>
#> 17 2022-06-09 2022-05-01 2022-05-31
Created on 2022-07-10 by the reprex package (v2.0.1)
CodePudding user response:
Does this work for you?
library(dplyr)
library(lubridate)
start_date <- c('2022-01-01','2022-02-01','2022-03-01','2022-04-01','2022-05-01','2022-06-01')
end_date <- c('2022-01-31','2022-02-28',NA,NA,'2022-05-31','2022-06-30')
dat1 <- data.frame(start_date, end_date)
date_col <- c('2022-01-15','2022-02-02','2022-03-17','2022-04-22','2022-05-14','2022-06-09')
dat_col <- data.frame(date_col)
# changing columns to date -----------------------------------
dat_col <- dat_col %>% mutate(date_col = ymd(date_col))
at1 <- dat1 %>% mutate(across(.fns = ymd))
# joining ----------------------------------------------------
dat_col %>%
mutate(placeholder = 1) %>% # creating this just for joining
left_join(dat1 %>% mutate(placeholder = 1)) %>%
filter(start_date <= date_col & (end_date < date_col | is.na(end_date))) %>%
select(-placeholder)
#> Joining, by = "placeholder"
#> date_col start_date end_date
#> 1 2022-02-02 2022-01-01 2022-01-31
#> 2 2022-03-17 2022-01-01 2022-01-31
#> 3 2022-03-17 2022-02-01 2022-02-28
#> 4 2022-03-17 2022-03-01 <NA>
#> 5 2022-04-22 2022-01-01 2022-01-31
#> 6 2022-04-22 2022-02-01 2022-02-28
#> 7 2022-04-22 2022-03-01 <NA>
#> 8 2022-04-22 2022-04-01 <NA>
#> 9 2022-05-14 2022-01-01 2022-01-31
#> 10 2022-05-14 2022-02-01 2022-02-28
#> 11 2022-05-14 2022-03-01 <NA>
#> 12 2022-05-14 2022-04-01 <NA>
#> 13 2022-06-09 2022-01-01 2022-01-31
#> 14 2022-06-09 2022-02-01 2022-02-28
#> 15 2022-06-09 2022-03-01 <NA>
#> 16 2022-06-09 2022-04-01 <NA>
#> 17 2022-06-09 2022-05-01 2022-05-31
Created on 2022-07-10 by the reprex package (v2.0.1)