Is there an elegant way to obtain the closest smaller and bigger values compared to some reference value? Following example should clarify my goal:
set.seed(42)
df_Date <- data.frame(Date = seq(lubridate::date('2022-01-01'), lubridate::date('2022-01-20'), by = 2))
df_Dummy <- data.frame(Dummy = seq(-0.2, 0.2, by = 0.05))
my_df <- df_Date |>
dplyr::full_join(df_Dummy, by = character()) |>
dplyr::mutate(Value = rnorm(90)) |>
dplyr::mutate(given_date = lubridate::date('2022-01-10'),
given_Dummy = 0.17)
I want to obtain the rows, where Date
is closest to given_date
and Dummy
is closest to given_dummy
. For each case, I want the closest smaller AND bigger value, since I would like to interpolate in the end.
Right now I am achieving the desired result the following way:
temp <- my_df |>
dplyr::mutate(Diff_Date = difftime(given_date, Date, units = 'days'))
temp.pos <- temp |>
dplyr::filter(Diff_Date >= 0) |>
dplyr::filter(Diff_Date == min(Diff_Date))
temp.neg <- temp |>
dplyr::filter(Diff_Date <= 0) |>
dplyr::filter(Diff_Date == max(Diff_Date))
temp2 <- dplyr::bind_rows(temp.neg, temp.pos) |>
dplyr::distinct(.keep_all = TRUE) |>
dplyr::mutate(Diff_Dummy = Dummy - given_Dummy)
temp2.pos <- temp2 |>
dplyr::filter(Diff_Dummy >= 0) |>
dplyr::filter(Diff_Dummy == min(Diff_Dummy))
temp2.neg <- temp2 |>
dplyr::filter(Diff_Dummy <= 0) |>
dplyr::filter(Diff_Dummy == max(Diff_Dummy))
final_df <- dplyr::bind_rows(temp2.neg, temp2.pos) |>
dplyr::distinct(.keep_all = TRUE)
Desired output:
> final_df
Date Dummy Value given_date given_Dummy Diff_Date Diff_Dummy
1 2022-01-11 0.15 1.5757275 2022-01-10 0.17 -1 days -0.02
2 2022-01-09 0.15 -0.7267048 2022-01-10 0.17 1 days -0.02
3 2022-01-11 0.20 0.6428993 2022-01-10 0.17 -1 days 0.03
4 2022-01-09 0.20 -1.3682810 2022-01-10 0.17 1 days 0.03
Note that if there is an exact match between Dummy
and given_dummy
resp. Date
and given_date
, i.e. Diff_Date == 0
resp. Diff_Dummy == 0
, then only this row is wanted.
I would like a more concise code, maybe using dplyr
, which is easier to grasp and provides the same result. I came across the following threads, but I wasn't able to come up with a solution:
r-find two closest values in a vector
Thanks a lot!
CodePudding user response:
You can calculate the differences of the two columns, then use filter
to get your desired rows. We use min
when the difference is larger than 0 and use max
when the difference is smaller than 0.
library(tidyverse)
my_df %>%
mutate(Diff_date = Date - given_date,
Diff_dummy = Dummy - given_Dummy) %>%
filter((Diff_date == min(Diff_date[Diff_date > 0]) |
Diff_date == max(Diff_date[Diff_date < 0])) &
(Diff_dummy == min(Diff_dummy[Diff_dummy > 0]) |
Diff_dummy == max(Diff_dummy[Diff_dummy < 0])))
Date Dummy Value given_date given_Dummy Diff_date Diff_dummy
1 2022-01-09 0.15 -0.7267048 2022-01-10 0.17 -1 days -0.02
2 2022-01-09 0.20 -1.3682810 2022-01-10 0.17 -1 days 0.03
3 2022-01-11 0.15 1.5757275 2022-01-10 0.17 1 days -0.02
4 2022-01-11 0.20 0.6428993 2022-01-10 0.17 1 days 0.03
Data
my_df <- structure(list(Date = structure(c(18993, 18993, 18993, 18993,
18993, 18993, 18993, 18993, 18993, 18995, 18995, 18995, 18995,
18995, 18995, 18995, 18995, 18995, 18997, 18997, 18997, 18997,
18997, 18997, 18997, 18997, 18997, 18999, 18999, 18999, 18999,
18999, 18999, 18999, 18999, 18999, 19001, 19001, 19001, 19001,
19001, 19001, 19001, 19001, 19001, 19003, 19003, 19003, 19003,
19003, 19003, 19003, 19003, 19003, 19005, 19005, 19005, 19005,
19005, 19005, 19005, 19005, 19005, 19007, 19007, 19007, 19007,
19007, 19007, 19007, 19007, 19007, 19009, 19009, 19009, 19009,
19009, 19009, 19009, 19009, 19009, 19011, 19011, 19011, 19011,
19011, 19011, 19011, 19011, 19011), class = "Date"), Dummy = c(-0.2,
-0.15, -0.1, -0.05, 0, 0.05, 0.1, 0.15, 0.2, -0.2, -0.15, -0.1,
-0.05, 0, 0.05, 0.1, 0.15, 0.2, -0.2, -0.15, -0.1, -0.05, 0,
0.05, 0.1, 0.15, 0.2, -0.2, -0.15, -0.1, -0.05, 0, 0.05, 0.1,
0.15, 0.2, -0.2, -0.15, -0.1, -0.05, 0, 0.05, 0.1, 0.15, 0.2,
-0.2, -0.15, -0.1, -0.05, 0, 0.05, 0.1, 0.15, 0.2, -0.2, -0.15,
-0.1, -0.05, 0, 0.05, 0.1, 0.15, 0.2, -0.2, -0.15, -0.1, -0.05,
0, 0.05, 0.1, 0.15, 0.2, -0.2, -0.15, -0.1, -0.05, 0, 0.05, 0.1,
0.15, 0.2, -0.2, -0.15, -0.1, -0.05, 0, 0.05, 0.1, 0.15, 0.2),
Value = c(1.37095844714667, -0.564698171396089, 0.363128411337339,
0.63286260496104, 0.404268323140999, -0.106124516091484,
1.51152199743894, -0.0946590384130976, 2.01842371387704,
-0.062714099052421, 1.30486965422349, 2.28664539270111, -1.38886070111234,
-0.278788766817371, -0.133321336393658, 0.635950398070074,
-0.284252921416072, -2.65645542090478, -2.44046692857552,
1.32011334573019, -0.306638594078475, -1.78130843398, -0.171917355759621,
1.2146746991726, 1.89519346126497, -0.4304691316062, -0.25726938276893,
-1.76316308519478, 0.460097354831271, -0.639994875960119,
0.455450123241219, 0.704837337228819, 1.03510352196992, -0.608926375407211,
0.50495512329797, -1.71700867907334, -0.784459008379496,
-0.850907594176518, -2.41420764994663, 0.0361226068922556,
0.205998600200254, -0.361057298548666, 0.758163235699517,
-0.726704827076575, -1.36828104441929, 0.432818025888717,
-0.811393176186672, 1.44410126172125, -0.431446202613345,
0.655647883402207, 0.321925265203947, -0.783838940880375,
1.57572751979198, 0.642899305717316, 0.0897606465996057,
0.276550747291463, 0.679288816055271, 0.0898328865790817,
-2.99309008315293, 0.284882953530659, -0.367234642740975,
0.185230564865609, 0.581823727365507, 1.39973682729268, -0.727292059474465,
1.30254263204414, 0.335848119752074, 1.03850609869762, 0.920728568290646,
0.720878162866862, -1.04311893856785, -0.0901863866107067,
0.623518161999544, -0.953523357772344, -0.542828814573857,
0.580996497681682, 0.768178737834591, 0.463767588540167,
-0.885776297409679, -1.09978089864786, 1.51270700980493,
0.257921437532031, 0.0884402291595864, -0.120896537539089,
-1.19432889516053, 0.611996898040387, -0.217139845746521,
-0.182756706331922, 0.93334632857116, 0.821773110508249),
given_date = structure(c(19002, 19002, 19002, 19002, 19002,
19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002,
19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002,
19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002,
19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002,
19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002,
19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002,
19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002,
19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002,
19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002, 19002,
19002, 19002, 19002, 19002), class = "Date"), given_Dummy = c(0.17,
0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17,
0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17,
0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17,
0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17,
0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17,
0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17,
0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17,
0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17,
0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17, 0.17)), class = "data.frame", row.names = c(NA,
-90L))