Home > Mobile >  R: Extract unique data with several conditions
R: Extract unique data with several conditions

Time:12-21

How can I create a new data set that keeps, for each unique ID, the row with the maximum time that is ≤ 4 years, together with the status and cancer values recorded at that same time?

I have data like this (data example):

I want to create a data set like the one in data1 (the data I want to extract):

# Example input: one row per (ID, follow-up year).
#   ID        - subject identifier (factor): 6 rows for "1", 4 for "2", 8 for "3"
#   timeYears - follow-up time in years, stored as a factor (as in the question)
#   status    - integer status flag at that time
#   cancer    - cancer code (factor), constant within each ID
# Built with data.frame()/factor() so that all columns have the same length
# (18 rows) and every factor has unique levels.
data <- data.frame(
  ID = factor(rep(c("1", "2", "3"), times = c(6L, 4L, 8L))),
  timeYears = factor(c(0:5, 0:3, 0:7)),
  status = c(
    0L, 0L, 0L, 0L, 1L, 1L,         # ID 1, years 0-5
    0L, 0L, 0L, 0L,                 # ID 2, years 0-3
    0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L  # ID 3, years 0-7
  ),
  cancer = factor(rep(c("1", "2", "1"), times = c(6L, 4L, 8L)))
)
# Desired result: for each ID, the row at the largest timeYears that is <= 4,
# with the status and cancer values recorded at that time.
# Factors are built with factor() so their levels are unique.
data1 <- data.frame(
  ID = factor(c("1", "2", "3")),
  timeYears = factor(c(4L, 3L, 4L)),
  status = c(1L, 0L, 0L),
  cancer = factor(c("1", "2", "1"))
)

CodePudding user response:

dplyr

library(dplyr)
# For each ID, keep the row(s) at the latest follow-up time that is <= 4 years.
# timeYears is a factor, and a factor ranks by level position rather than by
# the number it displays, so the years are parsed into a temporary integer
# column that is used for both the cut-off and the per-ID maximum.
data %>%
  mutate(.years = as.integer(as.character(timeYears))) %>%
  dplyr::filter(.years <= 4) %>%
  group_by(ID) %>%
  slice_max(.years) %>%
  ungroup() %>%
  select(-.years)  # drop the helper so the original columns are returned
# # A tibble: 3 × 4
#   ID    timeYears status cancer
#   <fct> <fct>      <int> <fct> 
# 1 1     4              1 1     
# 2 2     3              0 2     
# 3 3     4              0 1     

base R

# For each ID, keep the row(s) whose time equals that ID's largest time <= 4.
# ave() applies FUN to the parsed years of each ID and writes the per-row
# TRUE/FALSE flags back as 1/0, hence the `> 0` to get a logical row index.
data[ave(as.integer(as.character(data$timeYears)), data$ID,
         FUN = function(z) {
           # Treat missing times as "not selected" so they cannot turn the
           # whole group into NA (which would index all-NA rows).
           within_limit <- !is.na(z) & z <= 4
           # No observation within 4 years for this ID: select nothing,
           # instead of calling max() on an empty vector (-Inf + warning).
           if (!any(within_limit)) {
             return(rep(FALSE, length(z)))
           }
           within_limit & z == max(z[within_limit])
         }) > 0, ]
#    ID timeYears status cancer
# 5   1         4      1      1
# 10  2         3      0      2
# 15  3         4      0      1
  • Related