I have two dataframes: df1
- series of MRI reads for subject 1 by 2 different MRI readers; df2
- list of steroid use for subject 1
I am trying to perform a left join of df2$STATUS
to df1
based on whether df1$DOS
is contained within the lubridate::interval(start = df2$START_DATE, end = df2$STOP_DATE)
. If there is no df2$STATUS
available for that df1$DOS
, I want the value to be NA
.
As example, the final df should look something like this, since df1$DOS
falls within the first row lubridate::interval(start = df2$START_DATE, end = df2$STOP_DATE)
:
dput examples
df1 = structure(list(SUBJID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), READER = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"),
DOS = structure(c(15469, 15497, 15530, 15579, 15635, 15691,
15747, 15805, 15901, 16010, 16101, 16202, 16293, 16422, 16547,
16678, 16841, 17035, 17217, 17331, 17372, 17497, 17588, 17652,
17686, 17714, 17770, 17826, 17882, 17938, 17994, 18050, 18079,
18113, 18176, 15469, 15497, 15530, 15579, 15635, 15691, 15747,
15805, 15901, 16010, 16101, 16202, 16293, 16422, 16547, 16678,
16841, 17035, 17217, 17331, 17372, 17497, 17588, 17652, 17686,
17714, 17770, 17826, 17882, 17938, 17994, 18050, 18079, 18113,
18176), class = "Date"), VISIT = c("1 Screening", "1 Week 04",
"1 Week 08", "1 Week 16", "Unscheduled 01", "Unscheduled 02",
"Unscheduled 03", "Unscheduled 04", "Unscheduled 05", "Unscheduled 06",
"Unscheduled 07", "Unscheduled 08", "Unscheduled 09", "Unscheduled 10",
"Unscheduled 11", "Unscheduled 12", "Unscheduled 13", "Unscheduled 14",
"Unscheduled 15", "Unscheduled", "Unscheduled 16", "Unscheduled 17",
"Unscheduled", "2 Screening", "2 Week 04", "2 Week 08", "2 Week 16",
"Unscheduled 18", "Unscheduled 19", "Unscheduled 20", "Unscheduled 21",
"Unscheduled 22", "Unscheduled 23", "Unscheduled 24", "Unscheduled 25",
"1 Screening", "1 Week 04", "1 Week 08", "1 Week 16", "Unscheduled 01",
"Unscheduled 02", "Unscheduled 03", "Unscheduled 04", "Unscheduled 05",
"Unscheduled 06", "Unscheduled 07", "Unscheduled 08", "Unscheduled 09",
"Unscheduled 10", "Unscheduled 11", "Unscheduled 12", "Unscheduled 13",
"Unscheduled 14", "Unscheduled 15", "Unscheduled", "Unscheduled 16",
"Unscheduled 17", "Unscheduled", "2 Screening", "2 Week 04",
"2 Week 08", "2 Week 16", "Unscheduled 18", "Unscheduled 19",
"Unscheduled 20", "Unscheduled 21", "Unscheduled 22", "Unscheduled 23",
"Unscheduled 24", "Unscheduled 25"), MOS_DUR = c(0, 0.903225806451613,
2, 3.58064516129032, 5.41935483870968, 7.25806451612903,
9.07142857142857, 11.0333333333333, 14.1935483870968, 17.741935483871,
20.7096774193548, 24.0967741935484, 27.0645161290323, 31.2903225806452,
35.4333333333333, 39.7096774193548, 45.0344827586207, 51.4193548387097,
57.3928571428571, 61.1666666666667, 62.5161290322581, 66.6,
69.6071428571429, 71.7333333333333, 72.8387096774194, 73.7666666666667,
75.5806451612903, 77.4193548387097, 79.258064516129, 81.0714285714286,
82.9677419354839, 84.8064516129032, 85.7666666666667, 86.8709677419355,
88.9333333333333, 0, 0.903225806451613, 2, 3.58064516129032,
5.41935483870968, 7.25806451612903, 9.07142857142857, 11.0333333333333,
14.1935483870968, 17.741935483871, 20.7096774193548, 24.0967741935484,
27.0645161290323, 31.2903225806452, 35.4333333333333, 39.7096774193548,
45.0344827586207, 51.4193548387097, 57.3928571428571, 61.1666666666667,
62.5161290322581, 66.6, 69.6071428571429, 71.7333333333333,
72.8387096774194, 73.7666666666667, 75.5806451612903, 77.4193548387097,
79.258064516129, 81.0714285714286, 82.9677419354839, 84.8064516129032,
85.7666666666667, 86.8709677419355, 88.9333333333333), DAYS_DUR = c(0,
28, 61, 110, 166, 222, 278, 336, 432, 541, 632, 733, 824,
953, 1078, 1209, 1372, 1566, 1748, 1862, 1903, 2028, 2119,
2183, 2217, 2245, 2301, 2357, 2413, 2469, 2525, 2581, 2610,
2644, 2707, 0, 28, 61, 110, 166, 222, 278, 336, 432, 541,
632, 733, 824, 953, 1078, 1209, 1372, 1566, 1748, 1862, 1903,
2028, 2119, 2183, 2217, 2245, 2301, 2357, 2413, 2469, 2525,
2581, 2610, 2644, 2707), SPD = c(1845.52, 1658.84, 1924.06,
1914.49, 2094.22, 1890.46, 1369.93, 985.96, 570.47, 348.88,
500.76, 416.6, 407.92, 409.56, 362.33, 329.35, 326.93, 267.39,
304.2, 825.13, 399.47, 438.16, 800.82, 1947.56, 2357.86,
2959.97, 3066.57, 4235.27, 3308.1, 5321.45, 2840.12, 4162.53,
5379.55, 4132.86, 6570.47, 1375.57, 1744.6, 2522.15, 1828.55,
2013.06, 1555.05, 1294.24, 1048.76, 439.1, 402, 276.74, 121.37,
142.34, 72.9, 264.92, 62.27, 72.96, 68.91, 51.56, 251.33,
86.53, 134.48, 337.51, 1209.22, 1158.13, 1165.09, 2432.43,
2141.32, 3463.67, 5276.63, 3883.76, 4004.92, 4678.29, 4942.69,
5159.52), SPD_NADIR = c(1845.52, 1658.84, 1658.84, 1658.84,
1658.84, 1658.84, 1369.93, 985.96, 570.47, 348.88, 348.88,
348.88, 348.88, 348.88, 348.88, 329.35, 326.93, 267.39, 267.39,
267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39,
267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39,
1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1294.24,
1048.76, 439.1, 402, 276.74, 121.37, 121.37, 72.9, 72.9,
62.27, 62.27, 62.27, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56,
51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56,
51.56, 51.56), PCT_DIFF_NADIR = c(NA, -0.101153062551476,
0.15988280967423, 0.154113718019821, 0.262460514576451, 0.139627691639941,
-0.174163873550192, -0.280284394093129, -0.421406547932979,
-0.38843409819973, 0.435335932125659, 0.194106856225636,
0.169227241458381, 0.173927998165558, 0.0385519376289841,
-0.0559791332263242, -0.00734780628510708, -0.182118496314196,
0.137664086166274, 2.0858670855305, 0.493960133138861, 0.638655147911291,
1.99495119488388, 6.28359325330042, 7.81805602303751, 10.0698605033846,
10.4685291147762, 14.8392984030816, 11.3718164478851, 18.9014548038446,
9.62163880474214, 14.5672613037137, 19.1187404166199, 14.4562997868282,
23.5726092972811, NA, 0.268274242677581, 0.833530827220716,
0.329303488735579, 0.463436975217546, 0.13047682051804, -0.0591245810827511,
-0.189671158363209, -0.581315076852664, -0.0844910043270326,
-0.311592039800995, -0.561429500614295, 0.172777457361786,
-0.399357337068468, 2.63401920438957, -0.145816186556927,
0.171671752047535, 0.106632407258712, -0.171992933997109,
3.87451512800621, 0.678238944918541, 1.60822342901474, 5.54596586501164,
22.4526764934057, 21.4617920868891, 21.5967804499612, 46.1766873545384,
40.5306439100078, 66.1774631497285, 101.339604344453, 74.3250581846393,
76.6749418153607, 89.7348719937936, 94.8628782001551, 99.0682699767261
), SPD_BL = c(1845.52, 1845.52, 1845.52, 1845.52, 1845.52,
1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52,
1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52,
1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52,
1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52,
1845.52, 1845.52, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57,
1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57,
1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57,
1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57,
1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57,
1375.57, 1375.57), PCT_DIFF_BL = c(NA, -0.101153062551476,
0.042557111274871, 0.0373715809094456, 0.134758767176731,
0.024350860462092, -0.257699726906238, -0.465754909185487,
-0.690889288655772, -0.810958429060644, -0.728661840565261,
-0.77426416402965, -0.778967445489618, -0.77807880705709,
-0.803670510208505, -0.821540812345572, -0.822852095886254,
-0.855114005808661, -0.835168407820018, -0.552901079370584,
-0.783546100827951, -0.762581819758117, -0.566073518574711,
0.0552906497897611, 0.277612813732715, 0.60386774459231,
0.661629242706663, 1.29489249642377, 0.792502926004595, 1.88344206510902,
0.53892669816637, 1.25547813082492, 1.91492370713945, 1.23940136113399,
2.56022692791192, NA, 0.268274242677581, 0.833530827220716,
0.329303488735579, 0.463436975217546, 0.13047682051804, -0.0591245810827511,
-0.237581511664256, -0.680786873805041, -0.707757511431625,
-0.798817944561164, -0.911767485478747, -0.896522895963128,
-0.947003787520809, -0.807410746090711, -0.954731493126486,
-0.946960169238934, -0.949904403265555, -0.96251735644133,
-0.817290286935598, -0.93709516782134, -0.902236890888868,
-0.754639894734546, -0.120931686500869, -0.158072653518178,
-0.153012932820576, 0.768306956388988, 0.556678322440879,
1.51798890641698, 2.83595891157847, 1.82338230697093, 1.91146215750562,
2.40098286528494, 2.59319409408463, 2.75082329507041), NLFLG = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1), CLINDET_DT = structure(c(18177,
18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177,
18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177,
18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177,
18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177,
18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177,
18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177,
18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177,
18177, 18177, 18177, 18177, 18177, 18177), class = "Date")), row.names = c(NA,
-70L), groups = structure(list(SUBJID = c(1, 1), READER = structure(c(1L,
4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"),
.rows = structure(list(1:35, 36:70), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
df2 = structure(list(SUBJID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), DRUG = structure(c("Dexamethasone", "Dexamethasone",
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone",
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone",
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone",
"Dexamethasone", "Dexamethasone"), label = "Agent Name", format.sas = "$"),
ROUTE = structure(c("PO", "IVI", "IVI", "PO", "PO", "PO",
"PO", "PO", "IVI", "PO", "PO", "PO", "PO", "PO", "PO", "PO"
), label = "Route", format.sas = "$"), START_DT = structure(c(15455,
15469, 15470, 15471, 15472, 15473, 15474, 15475, 17653, 18061,
18079, 18115, 18122, 18129, 18136, 18142), class = "Date"),
STOP_DT = structure(c(15468, 15469, 15470, 15471, 15472,
15473, 15474, 15475, 17653, 18067, 18114, 18121, 18128, 18135,
18141, NA), class = "Date"), DAYS_DUR = c(13, 0, 0, 0, 0,
0, 0, 0, 0, 6, 35, 6, 6, 6, 5, NA), DEXEQ = c(0, 10, 30,
8, 8, 4, 2, 1, 4, 4, 4, 3.5, 3, 2.5, 2, 8), PHYSFLAG = c(0,
1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1), STATUS = c("STABLE",
"INCREASE", "INCREASE", "INCREASE", "INCREASE", "INCREASE",
"STABLE", "STABLE", "INCREASE", "INCREASE", "INCREASE", "INCREASE",
"INCREASE", "INCREASE", "STABLE", "INCREASE")), row.names = c(NA,
-16L), groups = structure(list(SUBJID = 1, .rows = structure(list(
1:16), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr",
"list"))), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
example = structure(list(SUBJID = 1, READER = structure(1L, .Label = c("1",
"2", "3", "4", "5"), class = "factor"), DOS = structure(15469, class = "Date"),
VISIT = "1 Screening", MOS_DUR = 0, DAYS_DUR = 0, SPD = 1845.52,
SPD_NADIR = 1845.52, PCT_DIFF_NADIR = NA_real_, SPD_BL = 1845.52,
PCT_DIFF_BL = NA_real_, NLFLG = 0, CLINDET_DT = structure(18177, class = "Date"),
STATUS = "STABLE"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
CodePudding user response:
Here is one solution using foverlaps
in the data.table
package (and dplyr
just for some cleaning). Basically we are treating the DOS variable as an interval here by setting same value for start and end date and then we join via overlapping intervals.
Another solution would be to full_join
by ID and then e.g. use %within%
from lubridate
but this would not preserve the rows for which there is no match.
Note that we cannot include those rows in df2
which have NA dates for the start or end variable. These are therefore removed before the join.
library(data.table)
library(dplyr)
dt1 <- as.data.table(df1 %>% mutate(start = DOS, end = DOS))
dt2 <- as.data.table(df2 %>%
select(SUBJID, start = START_DT, end =STOP_DT, STATUS) %>%
filter(!is.na(start)&!is.na(end)))
setkey(dt1, SUBJID, start, end)
setkey(dt2, SUBJID, start, end)
df3 <- foverlaps(dt1, dt2, type="within") %>%
select(-i.start, -i.end)
Tidyverse
solution
Here we apply one full join and filter to find all the matches and then one full join with the original df1
data set to keep the rows with no match at all.
library(lubridate)
df3 <- df1 %>%
full_join(df1 %>%
full_join(df2 %>%
select(SUBJID,START_DT, STOP_DT, STATUS), by = "SUBJID") %>%
filter(DOS %within% interval(START_DT, STOP_DT)))