Home > database >  Join two dataframes by the most closest datetime
Join two dataframes by the most closest datetime

Time:09-14

I would like to join these 2 data frames by the closest time of df1 to df2. Keeping all datetime present on df2 and the column validated filled with the name "coral_sol" and column N filled with NA.

df1 <- structure(list(validated = c("coral_sol", "coral_sol", "coral_sol", 
"coral_sol", "coral_sol", "coral_sol", "coral_sol", "coral_sol", 
"coral_sol", "coral_sol"), cycle = structure(c(1606782894, 1606786502, 
1606790113, 1606793721, 1606797332, 1606800941, 1606804550, 1606808160, 
1606808160, 1606845846), tzone = "UTC", class = c("POSIXct", 
"POSIXt")), N = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L)), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -10L), .internal.selfref = <pointer: 0x0000028e0d4d8f30>, groups = structure(list(
    cycle = structure(c(1606782894, 1606786502, 1606790113, 1606793721, 
    1606797332, 1606800941, 1606804550, 1606808160, 1606845846
    ), tzone = "UTC", class = c("POSIXct", "POSIXt")), .rows = structure(list(
        1L, 2L, 3L, 4L, 5L, 6L, 7L, 8:9, 10L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -9L), .drop = TRUE))

df2 <- structure(list(cycle = structure(c(1606780800, 1606782600, 1606784400, 
1606786200, 1606788000, 1606789800, 1606791600, 1606793400, 1606795200, 
1606797000, 1606798800, 1606800600, 1606802400, 1606804200, 1606806000, 
1606807800, 1606809600, 1606811400, 1606813200, 1606815000, 1606816800, 
1606818600, 1606820400, 1606822200, 1606824000, 1606825800, 1606827600, 
1606829400, 1606831200, 1606833000, 1606834800, 1606836600, 1606838400, 
1606840200, 1606842000, 1606843800, 1606845600, 1606847400, 1606849200, 
1606851000), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA, 
-40L), class = c("tbl_df", "tbl", "data.frame"))

Thanks any help

CodePudding user response:

Have you considered the fuzzyjoin package? https://cran.rstudio.com/web/packages/fuzzyjoin/

CodePudding user response:

One way is to first generate a helper column, e.g. __cycle_join (make sure it's not existing) in both frames to merge by, finding the closest matching date (using the first match if more than one matching date is found).

matches <- sapply( df1$cycle, function(x) which.min(abs(x - df2$cycle)) )
df1$"__cycle_join" <- matches
df2$"__cycle_join" <- 0
df2$"__cycle_join"[matches] <- matches

Then merge using right_join, also removing helper columns and sorting by column cycle.

library(dplyr)

right_join(df1, df2, by="__cycle_join", suffix = c(".x", "")) %>%
  mutate(validated = ifelse(is.na(validated), "coral_sol", validated), 
    "__cycle_join" = NULL, cycle.x = NULL) %>%
  arrange(cycle)
# A tibble: 41 × 3
# Groups:   cycle [40]
   validated     N cycle              
   <chr>     <int> <dttm>             
 1 coral_sol    NA 2020-12-01 00:00:00
 2 coral_sol     0 2020-12-01 00:30:00
 3 coral_sol    NA 2020-12-01 01:00:00
 4 coral_sol     0 2020-12-01 01:30:00
 5 coral_sol    NA 2020-12-01 02:00:00
 6 coral_sol     0 2020-12-01 02:30:00
 7 coral_sol    NA 2020-12-01 03:00:00
 8 coral_sol     0 2020-12-01 03:30:00
 9 coral_sol    NA 2020-12-01 04:00:00
10 coral_sol     0 2020-12-01 04:30:00
# … with 31 more rows

Data

df1 <- structure(list(validated = c("coral_sol", "coral_sol", "coral_sol", 
"coral_sol", "coral_sol", "coral_sol", "coral_sol", "coral_sol", 
"coral_sol", "coral_sol"), cycle = structure(c(1606782894, 1606786502, 
1606790113, 1606793721, 1606797332, 1606800941, 1606804550, 1606808160, 
1606808160, 1606845846), tzone = "UTC", class = c("POSIXct", 
"POSIXt")), N = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L)), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -10L), groups = structure(list(
    cycle = structure(c(1606782894, 1606786502, 1606790113, 1606793721, 
    1606797332, 1606800941, 1606804550, 1606808160, 1606845846
    ), tzone = "UTC", class = c("POSIXct", "POSIXt")), .rows = structure(list(
        1L, 2L, 3L, 4L, 5L, 6L, 7L, 8:9, 10L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -9L), .drop = TRUE))

df2 <- structure(list(cycle = structure(c(1606780800, 1606782600, 1606784400, 
1606786200, 1606788000, 1606789800, 1606791600, 1606793400, 1606795200, 
1606797000, 1606798800, 1606800600, 1606802400, 1606804200, 1606806000, 
1606807800, 1606809600, 1606811400, 1606813200, 1606815000, 1606816800, 
1606818600, 1606820400, 1606822200, 1606824000, 1606825800, 1606827600, 
1606829400, 1606831200, 1606833000, 1606834800, 1606836600, 1606838400, 
1606840200, 1606842000, 1606843800, 1606845600, 1606847400, 1606849200, 
1606851000), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA, 
-40L), class = c("tbl_df", "tbl", "data.frame"))
  • Related