Use R to find values for which a condition is first met-CodePudding

Consider the following sample dataset. Id is an individual identifier.

# Simulate a long-format dataset: one row per individual per time interval.
# The seed is fixed so the printed output below is reproducible.
set.seed(1)
n <- 100
X <- rbinom(n, 1, 0.5) # binary covariate
dat <- data.frame(id = seq_len(n), X)
ntp <- rep(4, n) # number of interior cut points per individual

# Build the (time1, time2, id) rows for one individual: `l` equally spaced
# cut points between 2 and 8, padded with 0 and 10, give l + 1 consecutive
# intervals.
make_intervals <- function(id, l) {
  cuts <- seq(from = 2, to = 8, length.out = l)
  bounds <- c(0, rep(cuts, each = 2), 10)
  cbind(matrix(bounds, ncol = 2, nrow = l + 1, byrow = TRUE), id)
}

# Stack all individuals at once instead of growing a matrix inside a loop.
w <- do.call(rbind, Map(make_intervals, seq_along(ntp), ntp))
d <- data.frame(w)
colnames(d) <- c("time1", "time2", "id")
D <- round(merge(d, dat, by = "id"), 2) # merging dataset
nr <- nrow(D)
# Exponential survival times, shifted by 1 so every value is at least 1.
D$Survival_time <- round(rexp(nr, 0.1) + 1, 3)
head(D, 15)
   id time1 time2 X Survival_time
1   1     0     2 0        21.341
2   1     2     4 0        18.987
3   1     4     6 0         4.740
4   1     6     8 0        13.296
5   1     8    10 0         6.397
6   2     0     2 0        10.566
7   2     2     4 0         2.470
8   2     4     6 0        14.907
9   2     6     8 0         8.620
10  2     8    10 0        13.376
11  3     0     2 1        45.239
12  3     2     4 1        11.545
13  3     4     6 1        11.352
14  3     6     8 1        19.760
15  3     8    10 1         7.547

How can I obtain the value at which Survival_time is less than time2 for the very first time per individual? I should end up with the following values:

id  Survival_time
1   4.740
2   2.470
3   7.547

Also, how can I subset the data so that each individual's rows stop at the point where this condition first occurs, i.e. obtain:

   id time1 time2 X Survival_time
1   1     0     2 0        21.341
2   1     2     4 0        18.987
3   1     4     6 0         4.740

6   2     0     2 0        10.566
7   2     2     4 0         2.470

11  3     0     2 1        45.239
12  3     2     4 1        11.545
13  3     4     6 1        11.352
14  3     6     8 1        19.760
15  3     8    10 1         7.547

CodePudding user response：

You can use -

library(dplyr)

# For each id, pick the Survival_time on the first row where it falls below
# time2. The result is NA for an id whose condition is never met.
D %>%
  group_by(id) %>%
  summarise(
    Survival_time = Survival_time[which(Survival_time < time2)[1]]
  )
  # Alternative using which.max (note: it returns the first row, not NA,
  # for an id where the condition is never TRUE):
  # summarise(Survival_time = Survival_time[which.max(Survival_time < time2)])

#     id Survival_time
#  <int>         <dbl>
#1     1          4.74
#2     2          2.47
#3     3          7.55

To select the rows up to that point, you may use -

# Keep each individual's rows up to and including the first interval where
# Survival_time < time2. `nomatch = n()` keeps every row of an individual
# whose condition is never met; without it match() returns NA, the
# comparison becomes NA, and filter() silently drops that individual.
D %>%
  group_by(id) %>%
  filter(
    row_number() <= match(TRUE, Survival_time < time2, nomatch = n())
  ) %>%
  ungroup()

#      id time1 time2     X Survival_time
#   <int> <int> <int> <int>         <dbl>
# 1     1     0     2     0         21.3 
# 2     1     2     4     0         19.0 
# 3     1     4     6     0          4.74
# 4     2     0     2     0         10.6 
# 5     2     2     4     0          2.47
# 6     3     0     2     1         45.2 
# 7     3     2     4     1         11.5 
# 8     3     4     6     1         11.4 
# 9     3     6     8     1         19.8 
#10     3     8    10     1          7.55

CodePudding user response：

Slight variation:

library(dplyr)
D %>%                                # Take D, and then
  group_by(id) %>%                   # group by id, and then
  filter(Survival_time < time2) %>%  # keep rows with Survival_time < time2,
  slice(1) %>%                       # keep the first such row per id, and then
  ungroup()                          # ungroup

CodePudding user response：

Using data.table

library(data.table)
# Per id, keep rows up to and including the first one where
# Survival_time < time2. `nomatch = .N` keeps all rows of an id whose
# condition is never met (an NA cut-off would drop the whole group).
# Note: setDT() converts D to a data.table by reference.
setDT(D)[
  ,
  .SD[seq_len(.N) <= match(TRUE, Survival_time < time2, nomatch = .N)],
  by = id
]

-output

  id time1 time2 X Survival_time
 1:  1     0     2 0        21.341
 2:  1     2     4 0        18.987
 3:  1     4     6 0         4.740
 4:  2     0     2 0        10.566
 5:  2     2     4 0         2.470
 6:  3     0     2 1        45.239
 7:  3     2     4 1        11.545
 8:  3     4     6 1        11.352
 9:  3     6     8 1        19.760
10:  3     8    10 1         7.547