Home > Enterprise >  Left Join Subset of Column Based on Date Interval
Left Join Subset of Column Based on Date Interval

Time:09-23

I have two dataframes: df1 - series of MRI reads for subject 1 by 2 different MRI readers; df2 - list of steroid use for subject 1

I am trying to perform a left join of df2$STATUS to df1 based on whether df1$DOS is contained within the lubridate::interval(start = df2$START_DATE, end = df2$STOP_DATE). If there is no df2$STATUS available for that df1$DOS, I want the value to be NA.

As example, the final df should look something like this, since df1$DOS falls within the first row lubridate::interval(start = df2$START_DATE, end = df2$STOP_DATE):

left join example

dput examples

df1 = structure(list(SUBJID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), READER = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), 
    DOS = structure(c(15469, 15497, 15530, 15579, 15635, 15691, 
    15747, 15805, 15901, 16010, 16101, 16202, 16293, 16422, 16547, 
    16678, 16841, 17035, 17217, 17331, 17372, 17497, 17588, 17652, 
    17686, 17714, 17770, 17826, 17882, 17938, 17994, 18050, 18079, 
    18113, 18176, 15469, 15497, 15530, 15579, 15635, 15691, 15747, 
    15805, 15901, 16010, 16101, 16202, 16293, 16422, 16547, 16678, 
    16841, 17035, 17217, 17331, 17372, 17497, 17588, 17652, 17686, 
    17714, 17770, 17826, 17882, 17938, 17994, 18050, 18079, 18113, 
    18176), class = "Date"), VISIT = c("1 Screening", "1 Week 04", 
    "1 Week 08", "1 Week 16", "Unscheduled 01", "Unscheduled 02", 
    "Unscheduled 03", "Unscheduled 04", "Unscheduled 05", "Unscheduled 06", 
    "Unscheduled 07", "Unscheduled 08", "Unscheduled 09", "Unscheduled 10", 
    "Unscheduled 11", "Unscheduled 12", "Unscheduled 13", "Unscheduled 14", 
    "Unscheduled 15", "Unscheduled", "Unscheduled 16", "Unscheduled 17", 
    "Unscheduled", "2 Screening", "2 Week 04", "2 Week 08", "2 Week 16", 
    "Unscheduled 18", "Unscheduled 19", "Unscheduled 20", "Unscheduled 21", 
    "Unscheduled 22", "Unscheduled 23", "Unscheduled 24", "Unscheduled 25", 
    "1 Screening", "1 Week 04", "1 Week 08", "1 Week 16", "Unscheduled 01", 
    "Unscheduled 02", "Unscheduled 03", "Unscheduled 04", "Unscheduled 05", 
    "Unscheduled 06", "Unscheduled 07", "Unscheduled 08", "Unscheduled 09", 
    "Unscheduled 10", "Unscheduled 11", "Unscheduled 12", "Unscheduled 13", 
    "Unscheduled 14", "Unscheduled 15", "Unscheduled", "Unscheduled 16", 
    "Unscheduled 17", "Unscheduled", "2 Screening", "2 Week 04", 
    "2 Week 08", "2 Week 16", "Unscheduled 18", "Unscheduled 19", 
    "Unscheduled 20", "Unscheduled 21", "Unscheduled 22", "Unscheduled 23", 
    "Unscheduled 24", "Unscheduled 25"), MOS_DUR = c(0, 0.903225806451613, 
    2, 3.58064516129032, 5.41935483870968, 7.25806451612903, 
    9.07142857142857, 11.0333333333333, 14.1935483870968, 17.741935483871, 
    20.7096774193548, 24.0967741935484, 27.0645161290323, 31.2903225806452, 
    35.4333333333333, 39.7096774193548, 45.0344827586207, 51.4193548387097, 
    57.3928571428571, 61.1666666666667, 62.5161290322581, 66.6, 
    69.6071428571429, 71.7333333333333, 72.8387096774194, 73.7666666666667, 
    75.5806451612903, 77.4193548387097, 79.258064516129, 81.0714285714286, 
    82.9677419354839, 84.8064516129032, 85.7666666666667, 86.8709677419355, 
    88.9333333333333, 0, 0.903225806451613, 2, 3.58064516129032, 
    5.41935483870968, 7.25806451612903, 9.07142857142857, 11.0333333333333, 
    14.1935483870968, 17.741935483871, 20.7096774193548, 24.0967741935484, 
    27.0645161290323, 31.2903225806452, 35.4333333333333, 39.7096774193548, 
    45.0344827586207, 51.4193548387097, 57.3928571428571, 61.1666666666667, 
    62.5161290322581, 66.6, 69.6071428571429, 71.7333333333333, 
    72.8387096774194, 73.7666666666667, 75.5806451612903, 77.4193548387097, 
    79.258064516129, 81.0714285714286, 82.9677419354839, 84.8064516129032, 
    85.7666666666667, 86.8709677419355, 88.9333333333333), DAYS_DUR = c(0, 
    28, 61, 110, 166, 222, 278, 336, 432, 541, 632, 733, 824, 
    953, 1078, 1209, 1372, 1566, 1748, 1862, 1903, 2028, 2119, 
    2183, 2217, 2245, 2301, 2357, 2413, 2469, 2525, 2581, 2610, 
    2644, 2707, 0, 28, 61, 110, 166, 222, 278, 336, 432, 541, 
    632, 733, 824, 953, 1078, 1209, 1372, 1566, 1748, 1862, 1903, 
    2028, 2119, 2183, 2217, 2245, 2301, 2357, 2413, 2469, 2525, 
    2581, 2610, 2644, 2707), SPD = c(1845.52, 1658.84, 1924.06, 
    1914.49, 2094.22, 1890.46, 1369.93, 985.96, 570.47, 348.88, 
    500.76, 416.6, 407.92, 409.56, 362.33, 329.35, 326.93, 267.39, 
    304.2, 825.13, 399.47, 438.16, 800.82, 1947.56, 2357.86, 
    2959.97, 3066.57, 4235.27, 3308.1, 5321.45, 2840.12, 4162.53, 
    5379.55, 4132.86, 6570.47, 1375.57, 1744.6, 2522.15, 1828.55, 
    2013.06, 1555.05, 1294.24, 1048.76, 439.1, 402, 276.74, 121.37, 
    142.34, 72.9, 264.92, 62.27, 72.96, 68.91, 51.56, 251.33, 
    86.53, 134.48, 337.51, 1209.22, 1158.13, 1165.09, 2432.43, 
    2141.32, 3463.67, 5276.63, 3883.76, 4004.92, 4678.29, 4942.69, 
    5159.52), SPD_NADIR = c(1845.52, 1658.84, 1658.84, 1658.84, 
    1658.84, 1658.84, 1369.93, 985.96, 570.47, 348.88, 348.88, 
    348.88, 348.88, 348.88, 348.88, 329.35, 326.93, 267.39, 267.39, 
    267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 
    267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1294.24, 
    1048.76, 439.1, 402, 276.74, 121.37, 121.37, 72.9, 72.9, 
    62.27, 62.27, 62.27, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 
    51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 
    51.56, 51.56), PCT_DIFF_NADIR = c(NA, -0.101153062551476, 
    0.15988280967423, 0.154113718019821, 0.262460514576451, 0.139627691639941, 
    -0.174163873550192, -0.280284394093129, -0.421406547932979, 
    -0.38843409819973, 0.435335932125659, 0.194106856225636, 
    0.169227241458381, 0.173927998165558, 0.0385519376289841, 
    -0.0559791332263242, -0.00734780628510708, -0.182118496314196, 
    0.137664086166274, 2.0858670855305, 0.493960133138861, 0.638655147911291, 
    1.99495119488388, 6.28359325330042, 7.81805602303751, 10.0698605033846, 
    10.4685291147762, 14.8392984030816, 11.3718164478851, 18.9014548038446, 
    9.62163880474214, 14.5672613037137, 19.1187404166199, 14.4562997868282, 
    23.5726092972811, NA, 0.268274242677581, 0.833530827220716, 
    0.329303488735579, 0.463436975217546, 0.13047682051804, -0.0591245810827511, 
    -0.189671158363209, -0.581315076852664, -0.0844910043270326, 
    -0.311592039800995, -0.561429500614295, 0.172777457361786, 
    -0.399357337068468, 2.63401920438957, -0.145816186556927, 
    0.171671752047535, 0.106632407258712, -0.171992933997109, 
    3.87451512800621, 0.678238944918541, 1.60822342901474, 5.54596586501164, 
    22.4526764934057, 21.4617920868891, 21.5967804499612, 46.1766873545384, 
    40.5306439100078, 66.1774631497285, 101.339604344453, 74.3250581846393, 
    76.6749418153607, 89.7348719937936, 94.8628782001551, 99.0682699767261
    ), SPD_BL = c(1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57), PCT_DIFF_BL = c(NA, -0.101153062551476, 
    0.042557111274871, 0.0373715809094456, 0.134758767176731, 
    0.024350860462092, -0.257699726906238, -0.465754909185487, 
    -0.690889288655772, -0.810958429060644, -0.728661840565261, 
    -0.77426416402965, -0.778967445489618, -0.77807880705709, 
    -0.803670510208505, -0.821540812345572, -0.822852095886254, 
    -0.855114005808661, -0.835168407820018, -0.552901079370584, 
    -0.783546100827951, -0.762581819758117, -0.566073518574711, 
    0.0552906497897611, 0.277612813732715, 0.60386774459231, 
    0.661629242706663, 1.29489249642377, 0.792502926004595, 1.88344206510902, 
    0.53892669816637, 1.25547813082492, 1.91492370713945, 1.23940136113399, 
    2.56022692791192, NA, 0.268274242677581, 0.833530827220716, 
    0.329303488735579, 0.463436975217546, 0.13047682051804, -0.0591245810827511, 
    -0.237581511664256, -0.680786873805041, -0.707757511431625, 
    -0.798817944561164, -0.911767485478747, -0.896522895963128, 
    -0.947003787520809, -0.807410746090711, -0.954731493126486, 
    -0.946960169238934, -0.949904403265555, -0.96251735644133, 
    -0.817290286935598, -0.93709516782134, -0.902236890888868, 
    -0.754639894734546, -0.120931686500869, -0.158072653518178, 
    -0.153012932820576, 0.768306956388988, 0.556678322440879, 
    1.51798890641698, 2.83595891157847, 1.82338230697093, 1.91146215750562, 
    2.40098286528494, 2.59319409408463, 2.75082329507041), NLFLG = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1), CLINDET_DT = structure(c(18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177), class = "Date")), row.names = c(NA, 
-70L), groups = structure(list(SUBJID = c(1, 1), READER = structure(c(1L, 
4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), 
    .rows = structure(list(1:35, 36:70), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

df2 = structure(list(SUBJID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1), DRUG = structure(c("Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone"), label = "Agent Name", format.sas = "$"), 
    ROUTE = structure(c("PO", "IVI", "IVI", "PO", "PO", "PO", 
    "PO", "PO", "IVI", "PO", "PO", "PO", "PO", "PO", "PO", "PO"
    ), label = "Route", format.sas = "$"), START_DT = structure(c(15455, 
    15469, 15470, 15471, 15472, 15473, 15474, 15475, 17653, 18061, 
    18079, 18115, 18122, 18129, 18136, 18142), class = "Date"), 
    STOP_DT = structure(c(15468, 15469, 15470, 15471, 15472, 
    15473, 15474, 15475, 17653, 18067, 18114, 18121, 18128, 18135, 
    18141, NA), class = "Date"), DAYS_DUR = c(13, 0, 0, 0, 0, 
    0, 0, 0, 0, 6, 35, 6, 6, 6, 5, NA), DEXEQ = c(0, 10, 30, 
    8, 8, 4, 2, 1, 4, 4, 4, 3.5, 3, 2.5, 2, 8), PHYSFLAG = c(0, 
    1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1), STATUS = c("STABLE", 
    "INCREASE", "INCREASE", "INCREASE", "INCREASE", "INCREASE", 
    "STABLE", "STABLE", "INCREASE", "INCREASE", "INCREASE", "INCREASE", 
    "INCREASE", "INCREASE", "STABLE", "INCREASE")), row.names = c(NA, 
-16L), groups = structure(list(SUBJID = 1, .rows = structure(list(
    1:16), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", 
"list"))), row.names = c(NA, -1L), class = c("tbl_df", "tbl", 
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df", 
"tbl", "data.frame"))

example  = structure(list(SUBJID = 1, READER = structure(1L, .Label = c("1", 
"2", "3", "4", "5"), class = "factor"), DOS = structure(15469, class = "Date"), 
    VISIT = "1 Screening", MOS_DUR = 0, DAYS_DUR = 0, SPD = 1845.52, 
    SPD_NADIR = 1845.52, PCT_DIFF_NADIR = NA_real_, SPD_BL = 1845.52, 
    PCT_DIFF_BL = NA_real_, NLFLG = 0, CLINDET_DT = structure(18177, class = "Date"), 
    STATUS = "STABLE"), row.names = c(NA, -1L), class = c("tbl_df", 
"tbl", "data.frame"))

CodePudding user response:

Here is one solution using foverlaps in the data.table package (and dplyr just for some cleaning). Basically we are treating the DOS variable as an interval here by setting same value for start and end date and then we join via overlapping intervals.

Another solution would be to full_join by ID and then e.g. use %within% from lubridate but this would not preserve the rows for which there is no match.

Note that we cannot include those rows in df2 which have NA dates for the start or end variable. These are therefore removed before the join.

library(data.table)
library(dplyr)

dt1 <- as.data.table(df1 %>% mutate(start = DOS, end = DOS))
dt2 <- as.data.table(df2 %>% 
                       select(SUBJID, start = START_DT, end =STOP_DT, STATUS) %>%
                       filter(!is.na(start)&!is.na(end)))
setkey(dt1, SUBJID, start, end)
setkey(dt2, SUBJID, start, end)
df3 <- foverlaps(dt1, dt2, type="within") %>% 
  select(-i.start, -i.end)

Tidyverse solution

Here we apply one full join and filter to find all the matches and then one full join with the original df1 data set to keep the rows with no match at all.

library(lubridate)

df3 <- df1 %>% 
  full_join(df1 %>%
              full_join(df2 %>% 
                          select(SUBJID,START_DT, STOP_DT, STATUS), by = "SUBJID") %>%
              filter(DOS %within% interval(START_DT, STOP_DT)))
  • Related