I have two tables, the label
which contains the label
time interval (start and end), and the items
which have a unique timestamp for each item
.
I want to map the items
based on their timestamps. If the item timestamp falls within the label
's time interval, therefore the item
belongs to that label
.
For example these two items
a123
, b682
are in the time interval which corresponds to label
X547
, so a123
, b682
belongs to X547
# Label, start - end time
have_label_start_end <- data.frame(label = c('X547', 'X285', 'X290')
, time = c(lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 9L, min = 58L, sec = 51, tz = "UTC"),
lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 4L, sec = 54, tz = "UTC"),
lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 8L, sec = 34, tz = "UTC"))) %>%
dplyr::mutate(start_time = time, stop_time = lead(time)) %>% dplyr::select(-time)
> have_label_start_end
label start_time stop_time
1 X547 2022-04-11 09:58:51 2022-04-11 10:04:54
2 X285 2022-04-11 10:04:54 2022-04-11 11:08:34
3 X290 2022-04-11 11:08:34 <NA>
# Item
have_item_time <- data.frame(item = c('a123', 'b682', 'c3324', 'd4343', 'e5343')
, timestamp = c(lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 9L, min = 59L, sec = 34, tz = "UTC"),
lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 3L, sec = 13, tz = "UTC"),
lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 5L, sec = 17, tz = "UTC"),
lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 8L, sec = 35, tz = "UTC"),
lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 10L, sec = 09, tz = "UTC")))
> have_item_time
item timestamp
1 a123 2022-04-11 09:59:34
2 b682 2022-04-11 10:03:13
3 c3324 2022-04-11 10:05:17
4 d4343 2022-04-11 11:08:35
5 e5343 2022-04-11 11:10:09
# Map Items to Label
want <- data.frame(label = c('X547', 'X547','X285', 'X290', 'X290'),
item = c('a123', 'b682', 'c3324', 'd4343', 'e5343')
, timestamp = c(lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 9L, min = 59L, sec = 34, tz = "UTC"),
lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 3L, sec = 13, tz = "UTC"),
lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 5L, sec = 17, tz = "UTC"),
lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 8L, sec = 35, tz = "UTC"),
lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 10L, sec = 09, tz = "UTC")))
> want
label item timestamp
1 X547 a123 2022-04-11 09:59:34
2 X547 b682 2022-04-11 10:03:13
3 X285 c3324 2022-04-11 10:05:17
4 X290 d4343 2022-04-11 11:08:35
5 X290 e5343 2022-04-11 11:10:09
CodePudding user response:
Here's an approach that converts the intervals to a start and end row, adds the item_time table, and then fills the labels from those intervals to populate the item_time table.
library(tidyverse)
have_label_start_end %>%
pivot_longer(-label, values_to = "timestamp") %>%
bind_rows(have_item_time) %>%
arrange(timestamp) %>%
fill(label, .direction = "down") %>%
filter(!is.na(item)) %>%
select(label, item, timestamp)
This assumes every item is within one and only one interval. If it's possible that an item could be outside of any intervals, or in more than one, we'd need to revise this.
Another approach could be to use the fuzzyjoin
package, which accommodates "non-equi" joins like this (as do data.table
and sqldf
). One tweak is that we need to replace any NA
stop_times with a valid timestamp. In this case we can use the max from have_item_time
.
Here, we get all the matches where the specific timestamp in have_item_time
is between (inclusive) any of the intervals in have_label_start_end
.
library(fuzzyjoin)
have_item_time %>%
fuzzy_left_join(have_label_start_end %>%
replace_na(list(stop_time = max(have_item_time$timestamp))),
by = c("timestamp" = "start_time",
"timestamp" = "stop_time"),
match_fun = list(`>=`, `<=`))
CodePudding user response:
We can use {powerjoin} :
library(powerjoin)
power_left_join(
have_item_time, have_label_start_end,
by = ~.x$timestamp > .y$start_time &
(.x$timestamp < .y$stop_time | is.na(.y$stop_time)),
keep = "left")
#> item timestamp label
#> 1 a123 2022-04-11 09:59:34 X547
#> 2 b682 2022-04-11 10:03:13 X547
#> 3 c3324 2022-04-11 10:05:17 X285
#> 4 d4343 2022-04-11 11:08:35 X290
#> 5 e5343 2022-04-11 11:10:09 X290