R dplyr join tables based on overlapping date time intervals-CodePudding

I have two tables, the label which contains the label time interval (start and end), and the items which have a unique timestamp for each item.

I want to map the items based on their timestamps. If the item timestamp falls within the label's time interval, therefore the item belongs to that label.

For example these two items a123, b682 are in the time interval which corresponds to label X547, so a123, b682 belongs to X547

# Label, start - end time
have_label_start_end <- data.frame(label = c('X547', 'X285', 'X290')
                   , time = c(lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 9L, min = 58L, sec = 51, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 4L, sec = 54, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 8L, sec = 34, tz = "UTC"))) %>% 
  dplyr::mutate(start_time = time, stop_time = lead(time)) %>% dplyr::select(-time) 

> have_label_start_end
  label          start_time           stop_time
1  X547 2022-04-11 09:58:51 2022-04-11 10:04:54
2  X285 2022-04-11 10:04:54 2022-04-11 11:08:34
3  X290 2022-04-11 11:08:34                <NA>


# Item
have_item_time <- data.frame(item = c('a123', 'b682', 'c3324', 'd4343', 'e5343')
           , timestamp = c(lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 9L, min = 59L, sec = 34, tz = "UTC"),
                      lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 3L, sec = 13, tz = "UTC"),
                      lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 5L, sec = 17, tz = "UTC"),
                      lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 8L, sec = 35, tz = "UTC"),
                      lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 10L, sec = 09, tz = "UTC")))

> have_item_time
   item           timestamp
1  a123 2022-04-11 09:59:34
2  b682 2022-04-11 10:03:13
3 c3324 2022-04-11 10:05:17
4 d4343 2022-04-11 11:08:35
5 e5343 2022-04-11 11:10:09


# Map Items to Label
want <- data.frame(label = c('X547', 'X547','X285', 'X290', 'X290'),
                   item = c('a123', 'b682', 'c3324', 'd4343', 'e5343')
                   , timestamp = c(lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 9L, min = 59L, sec = 34, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 3L, sec = 13, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 10L, min = 5L, sec = 17, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 8L, sec = 35, tz = "UTC"),
                              lubridate::make_datetime(year = 2022L, month = 4L, day = 11L, hour = 11L, min = 10L, sec = 09, tz = "UTC")))


> want
  label  item           timestamp
1  X547  a123 2022-04-11 09:59:34
2  X547  b682 2022-04-11 10:03:13
3  X285 c3324 2022-04-11 10:05:17
4  X290 d4343 2022-04-11 11:08:35
5  X290 e5343 2022-04-11 11:10:09

CodePudding user response：

Here's an approach that converts the intervals to a start and end row, adds the item_time table, and then fills the labels from those intervals to populate the item_time table.

library(tidyverse)
have_label_start_end %>%
  pivot_longer(-label, values_to = "timestamp") %>%
  bind_rows(have_item_time) %>%
  arrange(timestamp) %>%
  fill(label, .direction = "down") %>%
  filter(!is.na(item)) %>%
  select(label, item, timestamp)

This assumes every item is within one and only one interval. If it's possible that an item could be outside of any intervals, or in more than one, we'd need to revise this.

Another approach could be to use the fuzzyjoin package, which accommodates "non-equi" joins like this (as do data.table and sqldf). One tweak is that we need to replace any NA stop_times with a valid timestamp. In this case we can use the max from have_item_time.

Here, we get all the matches where the specific timestamp in have_item_time is between (inclusive) any of the intervals in have_label_start_end.

library(fuzzyjoin)
have_item_time %>%
  fuzzy_left_join(have_label_start_end %>%
                    replace_na(list(stop_time = max(have_item_time$timestamp))), 
                  by = c("timestamp" = "start_time",
                         "timestamp" = "stop_time"),
                  match_fun = list(`>=`, `<=`))

CodePudding user response：

We can use {powerjoin} :

library(powerjoin)
power_left_join(
  have_item_time, have_label_start_end, 
  by = ~.x$timestamp > .y$start_time & 
    (.x$timestamp < .y$stop_time | is.na(.y$stop_time)),
  keep = "left")
#>    item           timestamp label
#> 1  a123 2022-04-11 09:59:34  X547
#> 2  b682 2022-04-11 10:03:13  X547
#> 3 c3324 2022-04-11 10:05:17  X285
#> 4 d4343 2022-04-11 11:08:35  X290
#> 5 e5343 2022-04-11 11:10:09  X290