I would like to join these 2 data frames by the closest time of df1 to df2. Keeping all datetime present on df2 and the column validated filled with the name "coral_sol" and column N filled with NA.
df1 <- structure(list(validated = c("coral_sol", "coral_sol", "coral_sol",
"coral_sol", "coral_sol", "coral_sol", "coral_sol", "coral_sol",
"coral_sol", "coral_sol"), cycle = structure(c(1606782894, 1606786502,
1606790113, 1606793721, 1606797332, 1606800941, 1606804550, 1606808160,
1606808160, 1606845846), tzone = "UTC", class = c("POSIXct",
"POSIXt")), N = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -10L), .internal.selfref = <pointer: 0x0000028e0d4d8f30>, groups = structure(list(
cycle = structure(c(1606782894, 1606786502, 1606790113, 1606793721,
1606797332, 1606800941, 1606804550, 1606808160, 1606845846
), tzone = "UTC", class = c("POSIXct", "POSIXt")), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8:9, 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -9L), .drop = TRUE))
df2 <- structure(list(cycle = structure(c(1606780800, 1606782600, 1606784400,
1606786200, 1606788000, 1606789800, 1606791600, 1606793400, 1606795200,
1606797000, 1606798800, 1606800600, 1606802400, 1606804200, 1606806000,
1606807800, 1606809600, 1606811400, 1606813200, 1606815000, 1606816800,
1606818600, 1606820400, 1606822200, 1606824000, 1606825800, 1606827600,
1606829400, 1606831200, 1606833000, 1606834800, 1606836600, 1606838400,
1606840200, 1606842000, 1606843800, 1606845600, 1606847400, 1606849200,
1606851000), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA,
-40L), class = c("tbl_df", "tbl", "data.frame"))
Thanks any help
CodePudding user response:
Have you considered the fuzzyjoin package? https://cran.rstudio.com/web/packages/fuzzyjoin/
CodePudding user response:
One way is to first generate a helper column, e.g. __cycle_join
(make sure it's not existing) in both frames to merge by, finding the closest matching date
(using the first match if more than one matching date is found).
matches <- sapply( df1$cycle, function(x) which.min(abs(x - df2$cycle)) )
df1$"__cycle_join" <- matches
df2$"__cycle_join" <- 0
df2$"__cycle_join"[matches] <- matches
Then merge using right_join
, also removing helper columns and sorting by column cycle
.
library(dplyr)
right_join(df1, df2, by="__cycle_join", suffix = c(".x", "")) %>%
mutate(validated = ifelse(is.na(validated), "coral_sol", validated),
"__cycle_join" = NULL, cycle.x = NULL) %>%
arrange(cycle)
# A tibble: 41 × 3
# Groups: cycle [40]
validated N cycle
<chr> <int> <dttm>
1 coral_sol NA 2020-12-01 00:00:00
2 coral_sol 0 2020-12-01 00:30:00
3 coral_sol NA 2020-12-01 01:00:00
4 coral_sol 0 2020-12-01 01:30:00
5 coral_sol NA 2020-12-01 02:00:00
6 coral_sol 0 2020-12-01 02:30:00
7 coral_sol NA 2020-12-01 03:00:00
8 coral_sol 0 2020-12-01 03:30:00
9 coral_sol NA 2020-12-01 04:00:00
10 coral_sol 0 2020-12-01 04:30:00
# … with 31 more rows
Data
df1 <- structure(list(validated = c("coral_sol", "coral_sol", "coral_sol",
"coral_sol", "coral_sol", "coral_sol", "coral_sol", "coral_sol",
"coral_sol", "coral_sol"), cycle = structure(c(1606782894, 1606786502,
1606790113, 1606793721, 1606797332, 1606800941, 1606804550, 1606808160,
1606808160, 1606845846), tzone = "UTC", class = c("POSIXct",
"POSIXt")), N = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -10L), groups = structure(list(
cycle = structure(c(1606782894, 1606786502, 1606790113, 1606793721,
1606797332, 1606800941, 1606804550, 1606808160, 1606845846
), tzone = "UTC", class = c("POSIXct", "POSIXt")), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8:9, 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -9L), .drop = TRUE))
df2 <- structure(list(cycle = structure(c(1606780800, 1606782600, 1606784400,
1606786200, 1606788000, 1606789800, 1606791600, 1606793400, 1606795200,
1606797000, 1606798800, 1606800600, 1606802400, 1606804200, 1606806000,
1606807800, 1606809600, 1606811400, 1606813200, 1606815000, 1606816800,
1606818600, 1606820400, 1606822200, 1606824000, 1606825800, 1606827600,
1606829400, 1606831200, 1606833000, 1606834800, 1606836600, 1606838400,
1606840200, 1606842000, 1606843800, 1606845600, 1606847400, 1606849200,
1606851000), tzone = "UTC", class = c("POSIXct", "POSIXt"))), row.names = c(NA,
-40L), class = c("tbl_df", "tbl", "data.frame"))