I'm trying to analyse data with dates.
Here is the code for my data:
structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L, 6L, 6L, 6L, 6L, 6L, 6L), drug = c("b", "b", "c", "b", "b", "c", "a", "a", "a", "a", "a", "b"), hospital = c(142953L, 142953L, 142953L, 12035L, 12035L, 12035L, 133163L, 133163L, 133163L, 133163L, 133163L, 133163L), start_date.y = structure(c(12173, 12204, 12753, 12311, 12341, 12400, 12877, 12907, 12938, 13091, 13121, 13152), class = "Date"), total.price = c(100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L, 100L), Diff = c(89L, 31L, 549L, 0L, 30L, 59L, 31L, 30L, 31L, 153L, 30L, 31L), discontinuation = c("0", "0", "1", "0", "0", "0", "0", "0", "0", "1", "0", "0")), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA, -12L), groups = structure(list(id = c(1L, 2L, 6L), .rows = structure(list(1:3, 4:6, 7:12), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -3L), .drop = TRUE))
What I'm trying to calculate is the start date and end date of the drug prescriptions for each id.
First, I grouped data by "id" and "drug" variables.
- If the variable 'discontinuation' ==1, the person's end date will be the discontinuation date.
- If the variable 'discontinuation' == 0, the person's end date will be the last prescription date (max(start_date.y)).
I tried to calculate this by the code below.
bio_exp_dc <-bio_exp_dc %>% group_by(id) %>% summarise(start=min(start_date.y,na.rm = TRUE),end= ifelse(discontinuation==1,start_date.y,max(start_date.y)))
However, the following error occurred:
>Error in UseMethod("summarise") :
no applicable method for 'summarise' applied to an object of class "Date"
In addition: Warning message:
In min.default(c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, :
no non-missing arguments to min; returning Inf
CodePudding user response:
You could try `aggregate` with `range`. We first get numbers (days since the start of the UNIX epoch), but can easily restore the "Date" format.
# Flag every row at or after the first discontinuation within its id/drug
# group: the running sum of `discontinuation` stays 0 until a "1" appears.
bio <- transform(bio, disc = ave(discontinuation, id, drug, FUN = cumsum))

# First and last prescription date per id/drug, using only the rows before
# any discontinuation. The two grouping variables are joined with `+` in the
# formula. `range` returns two values per group, which
# `do.call(what = data.frame)` spreads into separate columns; at this point
# the dates are plain day counts.
a <- aggregate(start_date.y ~ id + drug, bio[bio$disc == 0, ], range) |>
  do.call(what = data.frame)

# Convert the day counts back into "Date" columns.
dt <- grep("date", names(a))
a[dt] <- lapply(a[dt], as.Date, origin = "1970-01-01")
a
# id drug start_date.y.1 start_date.y.2
# 1 6 a 2005-04-04 2005-06-04
# 2 1 b 2003-05-01 2003-06-01
# 3 2 b 2003-09-16 2003-10-16
# 4 6 b 2006-01-04 2006-01-04
# 5 2 c 2003-12-14 2003-12-14
Data:
# Example data: 12 prescription rows for ids 1, 2 and 6, with columns id,
# drug, hospital, start_date.y (class "Date"), total.price, Diff and
# discontinuation (character "0"/"1"). The object carries the "grouped_df"
# class with a groups attribute keyed on id.
bio <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L, 6L, 6L, 6L, 6L,
6L, 6L), drug = c("b", "b", "c", "b", "b", "c", "a", "a", "a",
"a", "a", "b"), hospital = c(142953L, 142953L, 142953L, 12035L,
12035L, 12035L, 133163L, 133163L, 133163L, 133163L, 133163L,
133163L), start_date.y = structure(c(12173, 12204, 12753, 12311,
12341, 12400, 12877, 12907, 12938, 13091, 13121, 13152), class = "Date"),
total.price = c(100L, 100L, 100L, 100L, 100L, 100L, 100L,
100L, 100L, 100L, 100L, 100L), Diff = c(89L, 31L, 549L, 0L,
30L, 59L, 31L, 30L, 31L, 153L, 30L, 31L), discontinuation = c("0",
"0", "1", "0", "0", "0", "0", "0", "0", "1", "0", "0")), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -12L), groups = structure(list(
id = c(1L, 2L, 6L), .rows = structure(list(1:3, 4:6, 7:12), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -3L), .drop = TRUE))
CodePudding user response:
# Per id/drug episode: first prescription date and end date. The end is the
# date of the first row flagged as discontinued, or the last prescription
# date when no row in the group is flagged.
df %>%
  group_by(id, drug) %>%
  summarise(
    start = min(start_date.y, na.rm = TRUE),
    # The condition is a single value per group, so a scalar `if`/`else` is
    # enough. `match()` picks the first flagged row in row order
    # (assumes rows are sorted by date within each group -- TODO confirm).
    end = if (any(discontinuation == 1, na.rm = TRUE)) {
      start_date.y[match(1, discontinuation)]
    } else {
      max(start_date.y, na.rm = TRUE)
    },
    # Keep the result grouped by id, stated explicitly so dplyr does not
    # emit its regrouping message.
    .groups = "drop_last"
  )
# A tibble: 6 × 4
# Groups: id [3]
id drug start end
<int> <chr> <date> <date>
1 1 b 2003-05-01 2003-06-01
2 1 c 2004-12-01 2004-12-01
3 2 b 2003-09-16 2003-10-16
4 2 c 2003-12-14 2003-12-14
5 6 a 2005-04-04 2005-11-04
6 6 b 2006-01-04 2006-01-04