Let's say we have two dataframes:
df1
structure(list(ID = c("A", "A", "A", "A", "B", "B", "C", "C",
"C", "C"), Date.X = structure(c(1452643200, 1452643200, 1452643200,
1560556800, 1491177600, 1537142400, 1524700800, 1524787200, 1452297600,
1417132800), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
Score.X = c(40, 50, 10, 45, 60, 100, 90, 400, 376, 276)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
df2
structure(list(ID = c("A", "A", "A", "B", "B", "B", "B", "C",
"C"), Date.Y = structure(c(1433462400, 1474588800, 1511740800,
1431475200, 1442188800, 1537747200, 1577404800, 1451606400, 1493942400
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Score.Y = c(35,
39, 130, 240, 126, 100, 95, 327, 257)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -9L))
I want to get the score from df1 into the df2, by have matching the same ID and approximate date.
Expected result:
structure(list(ID = c("A", "A", "A", "A", "B", "B", "C", "C",
"C", "C"), Date.X = structure(c(1452643200, 1452643200, 1452643200,
1560556800, 1491177600, 1537142400, 1524700800, 1524787200, 1452297600,
1417132800), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
Score.X = c(40, 50, 10, 45, 60, 100, 90, 400, 376, 276),
Score.Y = c(35, 35, 35, 130, 100, 100, 257, 257, 327, 327
), Date.Y = structure(c(1433462400, 1433462400, 1433462400,
1511740800, 1537747200, 1537747200, 1493942400, 1493942400,
1451606400, 1451606400), class = c("POSIXct", "POSIXt"), tzone = "UTC")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
I tried to code it myself with R with dplyr but could not get a result.
library(dplyr)
result <- df1 %>%
left_join(df2, by="ID") %>%
group_by(ID) %>%
mutate(diff = abs(as.numeric(difftime(Date.X, Date.Y, units="days")))) %>%
slice_min(diff) %>%
select(-diff
CodePudding user response:
Perhaps you can do it using this, but I'm not really sure what you are trying to achieve here.
inner_join(df2,df1, by="ID") %>%
mutate(date_diff = abs(as.numeric(difftime(Date.X,Date.Y,units = "days")))) %>%
group_by(ID, Date.X, Score.X) %>%
slice_min(date_diff) %>%
select(-date_diff) %>%
relocate(ID, Date.X, Score.X, Score.Y, Date.Y)
Output:
ID Date.X Score.X Score.Y Date.Y
<chr> <dttm> <dbl> <dbl> <dttm>
1 A 2016-01-13 00:00:00 10 35 2015-06-05 00:00:00
2 A 2016-01-13 00:00:00 40 35 2015-06-05 00:00:00
3 A 2016-01-13 00:00:00 50 35 2015-06-05 00:00:00
4 A 2019-06-15 00:00:00 45 130 2017-11-27 00:00:00
5 B 2017-04-03 00:00:00 60 100 2018-09-24 00:00:00
6 B 2018-09-17 00:00:00 100 100 2018-09-24 00:00:00
7 C 2014-11-28 00:00:00 276 327 2016-01-01 00:00:00
8 C 2016-01-09 00:00:00 376 327 2016-01-01 00:00:00
9 C 2018-04-26 00:00:00 90 257 2017-05-05 00:00:00
10 C 2018-04-27 00:00:00 400 257 2017-05-05 00:00:00