Home > Software engineering >  Replacing missing dates with dates from other rows based on a value in another column
Replacing missing dates with dates from other rows based on a value in another column

Time:02-24

I have a dataset with dates of birth. Sometimes the dates of birth are missing. If MG_FID is the same, this means that these individuals are siblings and father_dob and mother_dob are the same. So for those IIDs, for who MG_FID is the same, I want to replace the NAs in mother_dob and father_dob.

mg_fid<-structure(list(IID = c(1101480L, 1101481L, 1101509L, 1101554L, 
1101600L, 1101619L, 1101623L, 1101625L, 1101637L, 1101639L, 1101644L, 
1101647L, 1101650L, 1101652L, 1101654L, 1101655L, 1101656L, 1101659L, 
1101660L, 1101661L, 1101662L, 1101674L, 1101675L, 1101678L, 1101679L, 
1101681L, 1101685L, 1101692L, 1101693L, 1101694L, 1101696L, 1101701L, 
1101705L, 1101709L, 1101712L, 1101715L, 1101717L, 1101718L, 1101720L, 
1131480L, 1131481L, 1131554L, 1131600L, 1131619L, 1131623L, 1131625L, 
1131639L, 1131644L, 1131652L, 1131659L, 1131660L, 1131661L, 1131678L, 
1131679L, 1131685L, 1131692L, 1131696L, 1131701L, 1131705L, 1131709L, 
1131715L, 1131717L, 1131718L, 1193598L, 1193602L, 1193607L, 1193609L, 
1193612L, 1193617L, 1193618L, 1193620L, 1193621L, 1193622L, 1193634L, 
1193638L, 1193639L, 1193640L, 1193642L, 1193653L, 1193656L, 1193659L, 
1193660L, 1193661L, 1193664L, 1193667L, 1193668L, 1193689L, 2202638L, 
2202647L, 2202652L, 2202655L, 2202660L, 2202661L, 2202665L, 2202673L, 
2202686L, 2202693L, 2232655L, 2232661L, 2232665L, 2232686L, 2296211L, 
2296214L, 2296217L, 3305120L, 3305129L, 3305135L, 3305139L, 3305141L, 
3305144L, 3305151L, 3305153L, 3305154L, 3305161L, 3305164L, 3305168L, 
3335154L, 3345153L, 3392379L, 3392380L, 3392385L, 3392389L, 3392390L, 
3392391L, 3392392L, 3392395L, 3392400L, 4406324L, 4406326L, 4406327L, 
4406339L, 4406341L, 4406343L, 4406349L, 4406352L, 4406356L, 4406366L, 
4406370L, 4406382L, 4406391L, 4406395L, 4406396L, 4406398L, 4406403L, 
4406409L, 4406416L, 4406421L, 4406422L, 4406425L, 4406436L, 4406438L, 
4406460L, 4406467L, 4406469L, 4406480L, 4406483L, 4406484L, 4406485L, 
4436326L, 4436366L, 4436382L, 4436396L, 4436416L, 4436422L, 4436425L, 
4436436L, 4436467L, 4436469L, 4436480L, 4436483L, 4494908L, 4494910L, 
4494916L, 4494919L), child_dob = structure(c(15454, 15405, 15112, 
14371, 14789, 15154, 12628, 15548, 12871, 16199, 14206, 13984, 
13718, 14602, 15889, 16216, 13309, 15490, 13870, 12242, 12740, 
14956, 11581, 14433, 16346, 14208, 16288, 12957, 15932, 13314, 
11487, 11768, 14650, 16247, 12741, 15075, 14823, 14160, 13284, 
14798, 16300, 14803, 14049, NA, 13496, 16183, NA, 14206, 11675, 
11226, 14630, 13035, 13767, 15667, 15201, NA, 12014, 12646, 15524, 
16247, 16884, 15722, 14792, 16310, 16959, 16173, 16765, 15535, 
NA, 16659, 14909, 16004, 15088, 15293, 15844, 14653, 16304, 15694, 
15958, 14993, 15274, 15332, 16363, 14846, 16304, 16597, 15226, 
11404, 15126, 12299, 14852, 15606, 15884, 16515, 15475, 15982, 
13819, 15996, 16512, 14823, 16631, 14442, 16672, 14120, 16499, 
15269, 13863, 16755, 15690, 15380, 13232, 13482, 13997, 16924, 
13431, 14692, 14789, 12156, 15839, 16585, 15884, 13266, 16027, 
13214, 14770, 13228, 15578, 13892, 15133, 12759, 13151, 14447, 
14922, 14841, 14572, 15804, 15260, 14869, 11111, 15579, 13473, 
15099, 15588, 11801, 13915, 14692, 12526, 14145, 13782, 15131, 
14759, 13354, 12747, 13389, 13530, 15554, 15211, 11365, NA, NA, 
12577, 15479, NA, 14759, NA, 16528, NA, 11559, NA, NA, NA, NA, 
NA, NA), class = "Date"), mother_dob = structure(c(3751, 4152, 
2012, 4033, 3220, 3547, 226, 4627, 936, 3971, 488, -751, NA, 
-152, 7709, 6247, 628, 1515, 2679, 1220, 982, 3470, -480, 1034, 
7168, 1783, 3356, 2352, 4166, 892, 1601, 1002, 3436, 3581, 2842, 
NA, 7439, 1380, -476, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7939, 
7973, NA, 7396, 8002, NA, 8499, 7879, 7512, 7512, 7495, 7919, 
7503, 7412, 8069, 7823, 7321, 7399, 7686, 7682, 7504, 7738, 9053, 
7579, 1088, 3346, 2276, 1738, 3566, 5392, 1106, 214, 2348, 1863, 
NA, NA, NA, NA, 999, NA, 4484, 8885, 1958, 3197, 5721, 2815, 
5304, 5451, 2145, 5354, 4758, 2517, 1557, 5354, 2145, 6377, 6011, 
4084, -406, 332, 390, -716, 2218, 3982, 1251, 2842, 3158, 2008, 
5332, 5872, 1643, 7342, 3350, 5199, 1065, -2651, 6334, NA, 6508, 
2554, -835, 1502, 132, 1409, 4033, 3289, 1888, 2864, 1766, 629, 
739, 3158, 1915, 778, 3781, NA, 5199, NA, 6508, 132, 4033, 3289, 
1888, NA, NA, 3158, NA, 7075, NA, 2985, 431), class = "Date"), 
    father_dob = structure(c(3048, 4252, -485, 4363, 536, 565, 
    56, 1953, -534, 3535, -4109, -2086, NA, -321, 4858, 4287, 
    2169, -895, 2459, 873, 35, 3127, -1333, -73, 3866, 1573, 
    3192, 1424, -3378, -182, -139, 443, 3353, 5299, 97, NA, 5978, 
    1369, -3093, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7606, 
    NA, 6055, 7062, NA, 7893, 8325, 7297, 7297, 6510, 7849, 6883, 
    7503, 7536, 7356, 6338, NA, 7600, 7908, 7339, 5150, 7532, 
    7128, 511, 454, 2154, 2348, 2703, 4071, 1491, -1679, 3894, 
    -1475, NA, NA, NA, NA, -179, NA, 2941, 6677, 1067, -1206, 
    4403, 981, 3626, 437, 829, 2087, 2980, 2364, 653, 2087, 829, 
    6281, 6839, 4344, -3220, 3175, 335, 410, 1961, 2730, -951, 
    2670, 2938, -3676, 1977, 5758, 1154, 7582, 3564, 3297, 4148, 
    -2770, 5088, NA, 7605, 4096, -1145, 948, 1022, 560, 3016, 
    -1902, 2022, -703, 40, 446, 91, 1807, 1433, 4119, NA, NA, 
    3297, NA, 7605, 1022, 3016, -1902, 2022, NA, NA, 1807, NA, 
    5903, NA, 2521, 849), class = "Date"), MG_FID = c(1L, 2L, 
    0L, 3L, 4L, 5L, 7L, 8L, 0L, 9L, 10L, 0L, 0L, 11L, 0L, 0L, 
    0L, 12L, 13L, 14L, 0L, 0L, 0L, 15L, 16L, 17L, 18L, 19L, 20L, 
    0L, 21L, 22L, 23L, 24L, 0L, 25L, 26L, 27L, 0L, 1L, 2L, 3L, 
    4L, 5L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 18L, 
    19L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 28L, 0L, 29L, 30L, 0L, 31L, 0L, 
    28L, 29L, 30L, 31L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 32L, 
    35L, 33L, 0L, 0L, 34L, 33L, 35L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 36L, 37L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 38L, 0L, 
    39L, 0L, 0L, 40L, 41L, 0L, 0L, 42L, 0L, 43L, 44L, 45L, 0L, 
    46L, 47L, 48L, 49L, 50L, 0L, 51L, 37L, 38L, 39L, 40L, 42L, 
    43L, 44L, 45L, 47L, 48L, 49L, 50L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA, 
-174L))

mg_fid

I tried the code below, but it turns all my dates into random numbers.

parent_age %>% group_by(MG_FID) %>% 
mutate(father_dob = ifelse(is.na(father_dob),max(father_dob,na.rm=TRUE),father_dob), 
mother_dob = ifelse(is.na(mother_dob),max(mother_dob,na.rm=TRUE),mother_dob))

Does anyone have any ideas? Let me know if what I wrote doesn't make sense.

CodePudding user response:

# ensure mg_fid is tibble
mg_fid <- mg_fid %>% as_tibble(mg_fid)

# mutate columns ending in "r_dob", exclude the rows where MG_FID==0
mg_fid_new <- bind_rows(
  mg_fid %>% filter(MG_FID==0), 
  mg_fid %>% filter(MG_FID!=0) %>% 
    group_by(MG_FID) %>%
    mutate(across(.cols = ends_with("r_dob"),~mean(.x, na.rm=T)))
)

Input: as above updated by OP

Output:

# A tibble: 174 x 5
       IID child_dob  mother_dob father_dob MG_FID
     <int> <date>     <date>     <date>      <int>
 1 1101509 2011-05-18 1975-07-06 1968-09-03      0
 2 1101637 2005-03-29 1972-07-25 1968-07-16      0
 3 1101647 2008-04-15 1967-12-12 1964-04-16      0
 4 1101650 2007-07-24 NA         NA              0
 5 1101654 2013-07-03 1991-02-09 1983-04-21      0
 6 1101655 2014-05-26 1987-02-08 1981-09-27      0
 7 1101656 2006-06-10 1971-09-21 1975-12-10      0
 8 1101662 2004-11-18 1972-09-09 1970-02-05      0
 9 1101674 2010-12-13 1979-07-03 1978-07-25      0
10 1101675 2001-09-16 1968-09-08 1966-05-09      0
# ... with 164 more rows

The number of NA's has been reduced in the mother and father dob columns

> sapply(mg_fid, function(x) sum(is.na(x)))
       IID  child_dob mother_dob father_dob     MG_FID 
         0         15         40         43          0 
> sapply(mg_fid_new, function(x) sum(is.na(x)))
       IID  child_dob mother_dob father_dob     MG_FID 
         0         15          8         11          0 

CodePudding user response:

The NA values can be replaced by using a combination of group_by and fill, like so:

library(tidyverse)
library(janitor)
#> 
#> Attaching package: 'janitor'
#> The following objects are masked from 'package:stats':
#> 
#>     chisq.test, fisher.test

data<-structure(list(IID = c(1101480L, 1101481L, 1101509L, 1101554L, 
                               1101600L, 1101619L, 1101623L, 1101625L, 1101637L, 1101639L, 1101644L, 
                               1101647L, 1101650L, 1101652L, 1101654L, 1101655L, 1101656L, 1101659L, 
                               1101660L, 1101661L, 1101662L, 1101674L, 1101675L, 1101678L, 1101679L, 
                               1101681L, 1101685L, 1101692L, 1101693L, 1101694L, 1101696L, 1101701L, 
                               1101705L, 1101709L, 1101712L, 1101715L, 1101717L, 1101718L, 1101720L, 
                               1131480L, 1131481L, 1131554L, 1131600L, 1131619L, 1131623L, 1131625L, 
                               1131639L, 1131644L, 1131652L, 1131659L, 1131660L, 1131661L, 1131678L, 
                               1131679L, 1131685L, 1131692L, 1131696L, 1131701L, 1131705L, 1131709L, 
                               1131715L, 1131717L, 1131718L, 1193598L, 1193602L, 1193607L, 1193609L, 
                               1193612L, 1193617L, 1193618L, 1193620L, 1193621L, 1193622L, 1193634L, 
                               1193638L, 1193639L, 1193640L, 1193642L, 1193653L, 1193656L, 1193659L, 
                               1193660L, 1193661L, 1193664L, 1193667L, 1193668L, 1193689L, 2202638L, 
                               2202647L, 2202652L, 2202655L, 2202660L, 2202661L, 2202665L, 2202673L, 
                               2202686L, 2202693L, 2232655L, 2232661L, 2232665L, 2232686L, 2296211L, 
                               2296214L, 2296217L, 3305120L, 3305129L, 3305135L, 3305139L, 3305141L, 
                               3305144L, 3305151L, 3305153L, 3305154L, 3305161L, 3305164L, 3305168L, 
                               3335154L, 3345153L, 3392379L, 3392380L, 3392385L, 3392389L, 3392390L, 
                               3392391L, 3392392L, 3392395L, 3392400L, 4406324L, 4406326L, 4406327L, 
                               4406339L, 4406341L, 4406343L, 4406349L, 4406352L, 4406356L, 4406366L, 
                               4406370L, 4406382L, 4406391L, 4406395L, 4406396L, 4406398L, 4406403L, 
                               4406409L, 4406416L, 4406421L, 4406422L, 4406425L, 4406436L, 4406438L, 
                               4406460L, 4406467L, 4406469L, 4406480L, 4406483L, 4406484L, 4406485L, 
                               4436326L, 4436366L, 4436382L, 4436396L, 4436416L, 4436422L, 4436425L, 
                               4436436L, 4436467L, 4436469L, 4436480L, 4436483L, 4494908L, 4494910L, 
                               4494916L, 4494919L), child_dob = structure(c(15454, 15405, 15112, 
                                                                            14371, 14789, 15154, 12628, 15548, 12871, 16199, 14206, 13984, 
                                                                            13718, 14602, 15889, 16216, 13309, 15490, 13870, 12242, 12740, 
                                                                            14956, 11581, 14433, 16346, 14208, 16288, 12957, 15932, 13314, 
                                                                            11487, 11768, 14650, 16247, 12741, 15075, 14823, 14160, 13284, 
                                                                            14798, 16300, 14803, 14049, NA, 13496, 16183, NA, 14206, 11675, 
                                                                            11226, 14630, 13035, 13767, 15667, 15201, NA, 12014, 12646, 15524, 
                                                                            16247, 16884, 15722, 14792, 16310, 16959, 16173, 16765, 15535, 
                                                                            NA, 16659, 14909, 16004, 15088, 15293, 15844, 14653, 16304, 15694, 
                                                                            15958, 14993, 15274, 15332, 16363, 14846, 16304, 16597, 15226, 
                                                                            11404, 15126, 12299, 14852, 15606, 15884, 16515, 15475, 15982, 
                                                                            13819, 15996, 16512, 14823, 16631, 14442, 16672, 14120, 16499, 
                                                                            15269, 13863, 16755, 15690, 15380, 13232, 13482, 13997, 16924, 
                                                                            13431, 14692, 14789, 12156, 15839, 16585, 15884, 13266, 16027, 
                                                                            13214, 14770, 13228, 15578, 13892, 15133, 12759, 13151, 14447, 
                                                                            14922, 14841, 14572, 15804, 15260, 14869, 11111, 15579, 13473, 
                                                                            15099, 15588, 11801, 13915, 14692, 12526, 14145, 13782, 15131, 
                                                                            14759, 13354, 12747, 13389, 13530, 15554, 15211, 11365, NA, NA, 
                                                                            12577, 15479, NA, 14759, NA, 16528, NA, 11559, NA, NA, NA, NA, 
                                                                            NA, NA), class = "Date"), mother_dob = structure(c(3751, 4152, 
                                                                                                                               2012, 4033, 3220, 3547, 226, 4627, 936, 3971, 488, -751, NA, 
                                                                                                                               -152, 7709, 6247, 628, 1515, 2679, 1220, 982, 3470, -480, 1034, 
                                                                                                                               7168, 1783, 3356, 2352, 4166, 892, 1601, 1002, 3436, 3581, 2842, 
                                                                                                                               NA, 7439, 1380, -476, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
                                                                                                                               NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7939, 
                                                                                                                               7973, NA, 7396, 8002, NA, 8499, 7879, 7512, 7512, 7495, 7919, 
                                                                                                                               7503, 7412, 8069, 7823, 7321, 7399, 7686, 7682, 7504, 7738, 9053, 
                                                                                                                               7579, 1088, 3346, 2276, 1738, 3566, 5392, 1106, 214, 2348, 1863, 
                                                                                                                               NA, NA, NA, NA, 999, NA, 4484, 8885, 1958, 3197, 5721, 2815, 
                                                                                                                               5304, 5451, 2145, 5354, 4758, 2517, 1557, 5354, 2145, 6377, 6011, 
                                                                                                                               4084, -406, 332, 390, -716, 2218, 3982, 1251, 2842, 3158, 2008, 
                                                                                                                               5332, 5872, 1643, 7342, 3350, 5199, 1065, -2651, 6334, NA, 6508, 
                                                                                                                               2554, -835, 1502, 132, 1409, 4033, 3289, 1888, 2864, 1766, 629, 
                                                                                                                               739, 3158, 1915, 778, 3781, NA, 5199, NA, 6508, 132, 4033, 3289, 
                                                                                                                               1888, NA, NA, 3158, NA, 7075, NA, 2985, 431), class = "Date"), 
                       father_dob = structure(c(3048, 4252, -485, 4363, 536, 565, 
                                                56, 1953, -534, 3535, -4109, -2086, NA, -321, 4858, 4287, 
                                                2169, -895, 2459, 873, 35, 3127, -1333, -73, 3866, 1573, 
                                                3192, 1424, -3378, -182, -139, 443, 3353, 5299, 97, NA, 5978, 
                                                1369, -3093, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
                                                NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7606, 
                                                NA, 6055, 7062, NA, 7893, 8325, 7297, 7297, 6510, 7849, 6883, 
                                                7503, 7536, 7356, 6338, NA, 7600, 7908, 7339, 5150, 7532, 
                                                7128, 511, 454, 2154, 2348, 2703, 4071, 1491, -1679, 3894, 
                                                -1475, NA, NA, NA, NA, -179, NA, 2941, 6677, 1067, -1206, 
                                                4403, 981, 3626, 437, 829, 2087, 2980, 2364, 653, 2087, 829, 
                                                6281, 6839, 4344, -3220, 3175, 335, 410, 1961, 2730, -951, 
                                                2670, 2938, -3676, 1977, 5758, 1154, 7582, 3564, 3297, 4148, 
                                                -2770, 5088, NA, 7605, 4096, -1145, 948, 1022, 560, 3016, 
                                                -1902, 2022, -703, 40, 446, 91, 1807, 1433, 4119, NA, NA, 
                                                3297, NA, 7605, 1022, 3016, -1902, 2022, NA, NA, 1807, NA, 
                                                5903, NA, 2521, 849), class = "Date"), MG_FID = c(1L, 2L, 
                                                                                                  0L, 3L, 4L, 5L, 7L, 8L, 0L, 9L, 10L, 0L, 0L, 11L, 0L, 0L, 
                                                                                                  0L, 12L, 13L, 14L, 0L, 0L, 0L, 15L, 16L, 17L, 18L, 19L, 20L, 
                                                                                                  0L, 21L, 22L, 23L, 24L, 0L, 25L, 26L, 27L, 0L, 1L, 2L, 3L, 
                                                                                                  4L, 5L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 18L, 
                                                                                                  19L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 0L, 0L, 0L, 0L, 0L, 
                                                                                                  0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
                                                                                                  0L, 0L, 0L, 0L, 0L, 0L, 0L, 28L, 0L, 29L, 30L, 0L, 31L, 0L, 
                                                                                                  28L, 29L, 30L, 31L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 32L, 
                                                                                                  35L, 33L, 0L, 0L, 34L, 33L, 35L, 0L, 0L, 0L, 0L, 0L, 0L, 
                                                                                                  0L, 0L, 0L, 36L, 37L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 38L, 0L, 
                                                                                                  39L, 0L, 0L, 40L, 41L, 0L, 0L, 42L, 0L, 43L, 44L, 45L, 0L, 
                                                                                                  46L, 47L, 48L, 49L, 50L, 0L, 51L, 37L, 38L, 39L, 40L, 42L, 
                                                                                                  43L, 44L, 45L, 47L, 48L, 49L, 50L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                           -174L))

data <- data %>% 
  as_tibble() %>% 
  clean_names()

data %>% 
  group_by(mg_fid) %>% 
  fill(mother_dob:father_dob, .direction = "downup") %>% 
  ungroup()
#> # A tibble: 174 x 5
#>        iid child_dob  mother_dob father_dob mg_fid
#>      <int> <date>     <date>     <date>      <int>
#>  1 1101480 2012-04-24 1980-04-09 1978-05-07      1
#>  2 1101481 2012-03-06 1981-05-15 1981-08-23      2
#>  3 1101509 2011-05-18 1975-07-06 1968-09-03      0
#>  4 1101554 2009-05-07 1981-01-16 1981-12-12      3
#>  5 1101600 2010-06-29 1978-10-26 1971-06-21      4
#>  6 1101619 2011-06-29 1979-09-18 1971-07-20      5
#>  7 1101623 2004-07-29 1970-08-15 1970-02-26      7
#>  8 1101625 2012-07-27 1982-09-02 1975-05-08      8
#>  9 1101637 2005-03-29 1972-07-25 1968-07-16      0
#> 10 1101639 2014-05-09 1980-11-15 1979-09-06      9
#> # ... with 164 more rows

Created on 2022-02-24 by the reprex package (v2.0.1)

One question though - the mg_fid == 0 group has 82 entries, where 6 mother_dob are missing and 8 father_dob are missing. How are you planning to replace those NAs?

data %>% 
  count(mg_fid, sort = TRUE)

# A tibble: 51 x 2
   mg_fid     n
    <int> <int>
 1      0    82
 2      1     2
 3      2     2
 4      3     2
 5      4     2
 6      5     2
 7      7     2
 8      8     2
 9      9     2
10     10     2
# ... with 41 more rows

data %>% 
  filter(mg_fid == 0) %>% 
  map_df(~ sum(is.na(.)))

# A tibble: 1 x 5
    iid child_dob mother_dob father_dob mg_fid
  <int>     <int>      <int>      <int>  <int>
1     0         5          6          8      0
  • Related