Repeat rows in dataframe and change 1 score until a condition is reached by each ID-CodePudding

Can't seem to find any answer to this question. I have no idea what is the best way to do it.

This is a dummy dataframe:

ID	year	age	death
1	1999	18	2015
2	1984	25
3	2012	24

What i want to do is to repeat each row and add 1 to year and age until one of a set of condition is reached

year == 2018 or year == death

Ultimately so the new dataframe looks like this:

ID	year	age	death
1	1999	18	2015
1	2000	19	2015
1	2001	20	2015
1	2002	21	2015
1	2003	22	2015
1	2004	23	2015
1	2005	24	2015
1	2006	25	2015
1	2007	26	2015
1	2008	27	2015
1	2009	28	2015
1	2010	29	2015
1	2011	30	2015
1	2012	31	2015
1	2013	32	2015
1	2014	33	2015
1	2015	34	2015
2	2010	25
2	2011	26
2	2012	27
2	2013	28
2	2014	29
2	2015	30
2	2016	31
2	2017	32
2	2018	33
3	2012	24
3	2013	25
3	2014	26
3	2015	27
3	2016	28
3	2017	29
3	2018	30

Any suggestions?

Sincerely, A desperate newbie epidemiologist

CodePudding user response：

A dplyr solution:

library(dplyr)

df %>%
  group_by(ID) %>% 
  summarise(year = year:min(death, 2018, na.rm = TRUE),
            age = age   1:length(year) - 1, death) %>%
  ungroup()

Output

# A tibble: 59 × 4
      ID  year   age death
   <int> <int> <dbl> <int>
 1     1  1999    18  2015
 2     1  2000    19  2015
 3     1  2001    20  2015
 4     1  2002    21  2015
 5     1  2003    22  2015
 6     1  2004    23  2015
 7     1  2005    24  2015
 8     1  2006    25  2015
 9     1  2007    26  2015
10     1  2008    27  2015
11     1  2009    28  2015
12     1  2010    29  2015
13     1  2011    30  2015
14     1  2012    31  2015
15     1  2013    32  2015
16     1  2014    33  2015
17     1  2015    34  2015
18     2  1984    25    NA
19     2  1985    26    NA
20     2  1986    27    NA
21     2  1987    28    NA
22     2  1988    29    NA
23     2  1989    30    NA
24     2  1990    31    NA
25     2  1991    32    NA
26     2  1992    33    NA
27     2  1993    34    NA
28     2  1994    35    NA
29     2  1995    36    NA
30     2  1996    37    NA
31     2  1997    38    NA
32     2  1998    39    NA
33     2  1999    40    NA
34     2  2000    41    NA
35     2  2001    42    NA
36     2  2002    43    NA
37     2  2003    44    NA
38     2  2004    45    NA
39     2  2005    46    NA
40     2  2006    47    NA
41     2  2007    48    NA
42     2  2008    49    NA
43     2  2009    50    NA
44     2  2010    51    NA
45     2  2011    52    NA
46     2  2012    53    NA
47     2  2013    54    NA
48     2  2014    55    NA
49     2  2015    56    NA
50     2  2016    57    NA
51     2  2017    58    NA
52     2  2018    59    NA
53     3  2012    24    NA
54     3  2013    25    NA
55     3  2014    26    NA
56     3  2015    27    NA
57     3  2016    28    NA
58     3  2017    29    NA
59     3  2018    30    NA

CodePudding user response：

Using data.table:

library(data.table)
yr_max = 2018L
chnalocf = \(x) x[nafill(replace(seq_along(x), is.na(x), NA), "locf")]

DT = DT[, 
        .SD[CJ(year = min(year):min(yr_max, death, na.rm = TRUE)), on = 'year'], 
        by = ID]
DT[, age   := seq(first(age), first(age)   .N - 1L), by = ID]
DT[, death := first(death), by = ID]

Result:

        ID  year   age death
#     <int> <int> <int> <int>
#  1:     1  1999    18  2015
#  2:     1  2000    19  2015
#  3:     1  2001    20  2015
#  4:     1  2002    21  2015
#  5:     1  2003    22  2015
#  6:     1  2004    23  2015
#  7:     1  2005    24  2015
#  8:     1  2006    25  2015
#  9:     1  2007    26  2015
# 10:     1  2008    27  2015
# 11:     1  2009    28  2015
# 12:     1  2010    29  2015
# 13:     1  2011    30  2015
# 14:     1  2012    31  2015
# 15:     1  2013    32  2015
# 16:     1  2014    33  2015
# 17:     1  2015    34  2015
# 18:     2  1984    25    NA
# 19:     2  1985    26    NA
# 20:     2  1986    27    NA
# 21:     2  1987    28    NA
# 22:     2  1988    29    NA
# 23:     2  1989    30    NA
# 24:     2  1990    31    NA
# 25:     2  1991    32    NA
# 26:     2  1992    33    NA
# 27:     2  1993    34    NA
# 28:     2  1994    35    NA
# 29:     2  1995    36    NA
# 30:     2  1996    37    NA
# 31:     2  1997    38    NA
# 32:     2  1998    39    NA
# 33:     2  1999    40    NA
# 34:     2  2000    41    NA
# 35:     2  2001    42    NA
# 36:     2  2002    43    NA
# 37:     2  2003    44    NA
# 38:     2  2004    45    NA
# 39:     2  2005    46    NA
# 40:     2  2006    47    NA
# 41:     2  2007    48    NA
# 42:     2  2008    49    NA
# 43:     2  2009    50    NA
# 44:     2  2010    51    NA
# 45:     2  2011    52    NA
# 46:     2  2012    53    NA
# 47:     2  2013    54    NA
# 48:     2  2014    55    NA
# 49:     2  2015    56    NA
# 50:     2  2016    57    NA
# 51:     2  2017    58    NA
# 52:     2  2018    59    NA
# 53:     3  2012    24    NA
# 54:     3  2013    25    NA
# 55:     3  2014    26    NA
# 56:     3  2015    27    NA
# 57:     3  2016    28    NA
# 58:     3  2017    29    NA
# 59:     3  2018    30    NA
#        ID  year   age death

Reproducible data:

DT = data.table(
  ID    = 1:3,
  year  = c(1999L, 1984L, 2012L),
  age   = c(18L, 25L, 24L),
  death = c(2015L, NA, NA) 
)

CodePudding user response：

Using by.

by(dat, dat$ID, \(x) {
  if (is.na(x$death)) u <- 2018 else u <- x$death[1]
  year <- x$year[1]:u
  age <- (year - x$year[1])   x$age[1]
  cbind(x[setdiff(names(x), c('age', 'year'))], year, age, row.names=NULL)[c(1, 3, 4, 2)]
}) |> c(make.row.names=FALSE) |> do.call(what=rbind)
#    ID year age death
# 1   1 1999  18  2015
# 2   1 2000  19  2015
# 3   1 2001  20  2015
# 4   1 2002  21  2015
# 5   1 2003  22  2015
# 6   1 2004  23  2015
# 7   1 2005  24  2015
# 8   1 2006  25  2015
# 9   1 2007  26  2015
# 10  1 2008  27  2015
# 11  1 2009  28  2015
# 12  1 2010  29  2015
# 13  1 2011  30  2015
# 14  1 2012  31  2015
# 15  1 2013  32  2015
# 16  1 2014  33  2015
# 17  1 2015  34  2015
# 18  2 1984  25    NA
# 19  2 1985  26    NA
# 20  2 1986  27    NA
# 21  2 1987  28    NA
# 22  2 1988  29    NA
# 23  2 1989  30    NA
# 24  2 1990  31    NA
# 25  2 1991  32    NA
# 26  2 1992  33    NA
# 27  2 1993  34    NA
# 28  2 1994  35    NA
# 29  2 1995  36    NA
# 30  2 1996  37    NA
# 31  2 1997  38    NA
# 32  2 1998  39    NA
# 33  2 1999  40    NA
# 34  2 2000  41    NA
# 35  2 2001  42    NA
# 36  2 2002  43    NA
# 37  2 2003  44    NA
# 38  2 2004  45    NA
# 39  2 2005  46    NA
# 40  2 2006  47    NA
# 41  2 2007  48    NA
# 42  2 2008  49    NA
# 43  2 2009  50    NA
# 44  2 2010  51    NA
# 45  2 2011  52    NA
# 46  2 2012  53    NA
# 47  2 2013  54    NA
# 48  2 2014  55    NA
# 49  2 2015  56    NA
# 50  2 2016  57    NA
# 51  2 2017  58    NA
# 52  2 2018  59    NA
# 53  3 2012  24    NA
# 54  3 2013  25    NA
# 55  3 2014  26    NA
# 56  3 2015  27    NA
# 57  3 2016  28    NA
# 58  3 2017  29    NA
# 59  3 2018  30    NA

Data:

dat <- structure(list(ID = 1:3, year = c(1999L, 1984L, 2012L), age = c(18L, 
25L, 24L), death = c(2015L, NA, NA)), class = "data.frame", row.names = c(NA, 
-3L))