I have a data.frame with the number of passengers travelling on trains on specific days with different operator.

df<-data.frame(date_of_sampling =c("2021-01-01","2021-02-04","2021-01-03","2021-02-03"),operator=c("A","A","B","B"),num_passengers=c(204,155,100,400))

I then have another data.frame with the prevalence of COVID on a weekly basis.

ONS <- data.frame(sample_date_midpoint=c("2020-05-03","2020-06-10","2020-06-20","2020-08-03","2021-01-01","2021-01-06","2021-02-05","2021-02-08"),prevalence=runif(8))

I would like to match the closest ONS prevalence data with the closest date in df.

So far I have:

BASE R

# get time differences
temp <- outer(df$date_of_sampling, ONS$sample_date_midpoint,  "-")

# remove where ONS are more than 5 days before or after df
temp[temp < -5 | temp > 5] <- NA

# find index of minimum
ind <- apply(temp, 1, function(i) which.min(i))

# output
df2 <- cbind(ONS,  df[ind,])

Problem: How to find unique date to bind?

Data.table approach

setDT(df)            ## convert to data.table by reference
setDT(ONS)            ## same

df[, date := date_of_sampling]  ## create a duplicate of 'df'
setkey(df, date_of_sampling)    ## set the column to perform the join on
ONS[, date := sample_date_midpoint]  ## create a duplicate of 'ONS'
setkey(ONS, date)    ## same as above

ONS[df, roll=5]

Works but, what happens if there are multiple sampling days close together?

Dplyr approach ?

CodePudding user response：

You could convert character dates to Date and use roll='nearest':

setDT(df)            ## convert to data.table by reference
setDT(ONS)            ## same

df[, date := as.Date(date_of_sampling)]  ## create a duplicate of 'df'
setkey(df, date)    ## set the column to perform the join on
ONS[, date := as.Date(sample_date_midpoint)]  ## create a duplicate of 'ONS'
setkey(ONS, date)    ## same as above

ONS[df, roll='nearest'][
    abs(difftime(sample_date_midpoint,date_of_sampling,unit='day'))<5]  

# Key: <date>
#   sample_date_midpoint prevalence       date date_of_sampling operator num_passengers
# <char>      <num>     <Date>           <char>   <char>          <num>
# 1:           2021-01-01  0.1964160 2021-01-01       2021-01-01        A            204
# 2:           2021-01-01  0.1964160 2021-01-03       2021-01-03        B            100
# 3:           2021-02-05  0.3906553 2021-02-03       2021-02-03        B            400
# 4:           2021-02-05  0.3906553 2021-02-04       2021-02-04        A            155

CodePudding user response：

Possible dplyr approach:

library(dplyr)

# Date formatting
ONS <- ONS |> mutate(sample_date_midpoint = as.Date(sample_date_midpoint))
df <- df |> mutate(date_of_sampling = as.Date(date_of_sampling))

# Identify closest   Join
df |>
  group_by(date_of_sampling) |>
   mutate(nearest_sample_date_midpoint = ONS$sample_date_midpoint[which.min(abs(ONS$sample_date_midpoint - first(date_of_sampling)))]) |>
  ungroup() |>
  left_join(ONS, by = c("nearest_sample_date_midpoint" = "sample_date_midpoint")) # |>
  # filter(as.numeric(nearest_sample_date_midpoint - date_of_sampling, unit = "days") < 5)

Output:

# A tibble: 5 × 5
  date_of_sampling operator num_passengers nearest_sample_date_midpoint prevalence
  <date>           <chr>             <dbl> <date>                            <dbl>
1 2021-01-01       A                   204 2021-01-01                       0.516 
2 2021-02-04       A                   155 2021-02-05                       0.0171
3 2021-01-03       B                   100 2021-01-01                       0.516 
4 2021-02-03       B                   400 2021-02-05                       0.0171
5 2019-02-03       C                  1000 2020-05-03                       0.208

Data with edge case:

df <- data.frame(date_of_sampling = c("2021-01-01","2021-02-04","2021-01-03","2021-02-03", "2020-02-03"),
                 operator = c("A","A","B","B", "C"),
                 num_passengers = c(204,155,100,400,1000)
                 )

Updated with edge case filter