I have two dataframes which I want to merge by nearest date. There´re often the same date of df2 which fits to df1. Also there´re values of df2 which doesn´t fit to df1 at all. After merging I want to know the time difference between the merging dates. For example ship Sally fits to two Peaks, ship Carl to no one. I tried different solutions e.g. with data.table roll='nearest' function
df1 <- data.frame(date = (c("23.11.2021 20:56:06", "23.11.2021 20:56:07","23.11.2021 20:56:08",
"23.11.2021 20:56:09","23.11.2021 20:56:10",
"23.11.2021 20:56:11", "23.11.2021 22:13:56",
"23.11.2021 22:13:57", "23.11.2021 22:13:58",
"23.11.2021 22:13:59", "24.11.2021 03:23:21",
"24.11.2021 03:23:22", "24.11.2021 03:23:23",
"24.11.2021 03:23:24", "24.11.2021 03:23:25",
"24.11.2021 03:24:34", "24.11.2021 03:24:35",
"24.11.2021 03:24:36", "24.11.2021 03:24:37")),
value = (c(500, 900, 1000, 200, 300, 10, 1000, 450, 950, 600,
100, 750, 150, 200, 300, 400, 900, 1020, 800)))
df2 <- data.frame(date = (c("23.11.2021 20:55:47", "23.11.2021 21:17:48",
"23.11.2021 21:44:19", "23.11.2021 21:55:57",
"23.11.2021 22:16:01","23.11.2021 22:26:35",
"23.11.2021 22:28:43", "23.11.2021 23:14:52",
"23.11.2021 23:32:30","23.11.2021 23:33:43",
"24.11.2021 03:11:30", "24.11.2021 03:23:37",
"24.11.2021 04:42:41", "24.11.2021 04:53:09",
"24.11.2021 04:58:19")),
value = (c("Ship Emma", "Ship Carl", "Ship Rudi", "Ship Tim", "Ship Amy",
"Ship Fred", "Ship Noemi"
, "Ship Fanny", "Ship Ole", "Ship Ally",
"Ship Hugo", "Ship Sally", "Ship Peter", "Ship Harry", "Ship Piet")))
result <- data.frame(date = (c("23.11.2021 20:56:06", "23.11.2021 20:56:07",
"23.11.2021 20:56:08", "23.11.2021 20:56:09",
"23.11.2021 20:56:10","23.11.2021 20:56:11",
"23.11.2021 22:13:56", "23.11.2021 22:13:57",
"23.11.2021 22:13:58", "23.11.2021 22:13:59",
"24.11.2021 03:23:21", "24.11.2021 03:23:22",
"24.11.2021 03:23:23", "24.11.2021 03:23:24",
"24.11.2021 03:23:25", "24.11.2021 03:24:34",
"24.11.2021 03:24:35", "24.11.2021 03:24:36",
"24.11.2021 03:24:37")),
value = (c(500, 900, 1000, 200, 300, 10, 1000, 450, 950, 600,
100, 750, 150, 200, 300, 400, 900, 1020, 800)),
id = (c("Ship Emma", "Ship Emma", "Ship Emma", "Ship Emma", "Ship Emma",
"Ship Emma", "Ship Amy", "Ship Amy", "Ship Amy", "Ship Amy",
"Ship Sally", "Ship Sally", "Ship Sally", "Ship Sally",
"Ship Sally", "Ship Sally", "Ship Sally", "Ship Sally","Ship Sally")),
difference = (c("00:00:19", "00:00:20", "00:00:21", "00:00:22", "00:00:23",
"00:00:24", "00:00:05", "00:00:04", "00:00:03", "00:00:02",
"00:00:16", "00:00:15", "00:00:14", "00:00:13", "00:00:12",
"00:00:57", "00:00:58", "00:00:59", "00:01:00")))
df1$date<- as.POSIXct(df1$date, tz = "GMT", format = "%d.%m.%Y %H:%M:%S")
df2$date<- as.POSIXct(df2$date, tz = "GMT", format = "%d.%m.%Y %H:%M:%S")
result$date<- as.POSIXct(result$date, tz = "GMT", format = "%d.%m.%Y %H:%M:%S")
CodePudding user response:
I suggest two approaches. The first uses a distance matrix and perform a left_join of df1 to df2. Namely the distance matrix is given by:
dateDist <- outer(pull(df1, date), pull(df2, date), "-") %>%
abs()
Next, for each row of df1
, the row of df2
with closest distance is given by:
closest.df1 <- apply(dateDist, 1, which.min)
Finally, the merge is performed manually:
cbind(rename_with(df1, ~paste0("df1.", "", .x)),
rename_with(df2[closest.df1,], ~paste0("df2.", "", .x)))
##> df1.date df1.value df2.date df2.value
##>1 2021-11-23 20:56:06 500 2021-11-23 20:55:47 Ship Emma
##>1.1 2021-11-23 20:56:07 900 2021-11-23 20:55:47 Ship Emma
##>1.2 2021-11-23 20:56:08 1000 2021-11-23 20:55:47 Ship Emma
##>1.3 2021-11-23 20:56:09 200 2021-11-23 20:55:47 Ship Emma
##>1.4 2021-11-23 20:56:10 300 2021-11-23 20:55:47 Ship Emma
##>1.5 2021-11-23 20:56:11 10 2021-11-23 20:55:47 Ship Emma
##>5 2021-11-23 22:13:56 1000 2021-11-23 22:16:01 Ship Amy
##>5.1 2021-11-23 22:13:57 450 2021-11-23 22:16:01 Ship Amy
##>5.2 2021-11-23 22:13:58 950 2021-11-23 22:16:01 Ship Amy
##>5.3 2021-11-23 22:13:59 600 2021-11-23 22:16:01 Ship Amy
##>12 2021-11-24 03:23:21 100 2021-11-24 03:23:37 Ship Sally
##>12.1 2021-11-24 03:23:22 750 2021-11-24 03:23:37 Ship Sally
##>12.2 2021-11-24 03:23:23 150 2021-11-24 03:23:37 Ship Sally
##>12.3 2021-11-24 03:23:24 200 2021-11-24 03:23:37 Ship Sally
##>12.4 2021-11-24 03:23:25 300 2021-11-24 03:23:37 Ship Sally
##>12.5 2021-11-24 03:24:34 400 2021-11-24 03:23:37 Ship Sally
##>12.6 2021-11-24 03:24:35 900 2021-11-24 03:23:37 Ship Sally
##>12.7 2021-11-24 03:24:36 1020 2021-11-24 03:23:37 Ship Sally
##>12.8 2021-11-24 03:24:37 800 2021-11-24 03:23:37 Ship Sally
The second approach involves first calculating the cartesian product of all the rows of df1
and df2
and then selecting only the rows with the minimum distance. The trick here is to use inner_join(..., by =character())
to get all the combinations of the two dataframes :
mutate(df1, id = row_number()) %>%
inner_join(mutate(df2, id = row_number()),by = character()) |>
mutate(dist = abs(date.x - date.y)) |>
group_by(id.x) |>
filter(dist == min(dist)) |>
select(-id.x, -id.y, -dist)
##> # A tibble: 19 × 7
##># Groups: id.x [19]
##> date.x value.x id.x date.y value.y id.y dist
##> <dttm> <dbl> <int> <dttm> <chr> <int> <drtn>
##> 1 2021-11-23 20:56:06 500 1 2021-11-23 20:55:47 Ship Emma 1 19 s…
##> 2 2021-11-23 20:56:07 900 2 2021-11-23 20:55:47 Ship Emma 1 20 s…
##> 3 2021-11-23 20:56:08 1000 3 2021-11-23 20:55:47 Ship Emma 1 21 s…
##> 4 2021-11-23 20:56:09 200 4 2021-11-23 20:55:47 Ship Emma 1 22 s…
##> 5 2021-11-23 20:56:10 300 5 2021-11-23 20:55:47 Ship Emma 1 23 s…
##> 6 2021-11-23 20:56:11 10 6 2021-11-23 20:55:47 Ship Emma 1 24 s…
##> 7 2021-11-23 22:13:56 1000 7 2021-11-23 22:16:01 Ship Amy 5 125 s…
##> 8 2021-11-23 22:13:57 450 8 2021-11-23 22:16:01 Ship Amy 5 124 s…
##> 9 2021-11-23 22:13:58 950 9 2021-11-23 22:16:01 Ship Amy 5 123 s…
##>10 2021-11-23 22:13:59 600 10 2021-11-23 22:16:01 Ship Amy 5 122 s…
##>11 2021-11-24 03:23:21 100 11 2021-11-24 03:23:37 Ship Sally 12 16 s…
##>12 2021-11-24 03:23:22 750 12 2021-11-24 03:23:37 Ship Sally 12 15 s…
##>13 2021-11-24 03:23:23 150 13 2021-11-24 03:23:37 Ship Sally 12 14 s…
##>14 2021-11-24 03:23:24 200 14 2021-11-24 03:23:37 Ship Sally 12 13 s…
##>15 2021-11-24 03:23:25 300 15 2021-11-24 03:23:37 Ship Sally 12 12 s…
##>16 2021-11-24 03:24:34 400 16 2021-11-24 03:23:37 Ship Sally 12 57 s…
##>17 2021-11-24 03:24:35 900 17 2021-11-24 03:23:37 Ship Sally 12 58 s…
##>18 2021-11-24 03:24:36 1020 18 2021-11-24 03:23:37 Ship Sally 12 59 s…
##>19 2021-11-24 03:24:37 800 19 2021-11-24 03:23:37 Ship Sally 12 60 s…