I have a dataset with dates of birth. Sometimes the dates of birth are missing. If MG_FID is the same, this means that these individuals are siblings and father_dob and mother_dob are the same. So for those IIDs, for who MG_FID is the same, I want to replace the NAs in mother_dob and father_dob.
mg_fid<-structure(list(IID = c(1101480L, 1101481L, 1101509L, 1101554L,
1101600L, 1101619L, 1101623L, 1101625L, 1101637L, 1101639L, 1101644L,
1101647L, 1101650L, 1101652L, 1101654L, 1101655L, 1101656L, 1101659L,
1101660L, 1101661L, 1101662L, 1101674L, 1101675L, 1101678L, 1101679L,
1101681L, 1101685L, 1101692L, 1101693L, 1101694L, 1101696L, 1101701L,
1101705L, 1101709L, 1101712L, 1101715L, 1101717L, 1101718L, 1101720L,
1131480L, 1131481L, 1131554L, 1131600L, 1131619L, 1131623L, 1131625L,
1131639L, 1131644L, 1131652L, 1131659L, 1131660L, 1131661L, 1131678L,
1131679L, 1131685L, 1131692L, 1131696L, 1131701L, 1131705L, 1131709L,
1131715L, 1131717L, 1131718L, 1193598L, 1193602L, 1193607L, 1193609L,
1193612L, 1193617L, 1193618L, 1193620L, 1193621L, 1193622L, 1193634L,
1193638L, 1193639L, 1193640L, 1193642L, 1193653L, 1193656L, 1193659L,
1193660L, 1193661L, 1193664L, 1193667L, 1193668L, 1193689L, 2202638L,
2202647L, 2202652L, 2202655L, 2202660L, 2202661L, 2202665L, 2202673L,
2202686L, 2202693L, 2232655L, 2232661L, 2232665L, 2232686L, 2296211L,
2296214L, 2296217L, 3305120L, 3305129L, 3305135L, 3305139L, 3305141L,
3305144L, 3305151L, 3305153L, 3305154L, 3305161L, 3305164L, 3305168L,
3335154L, 3345153L, 3392379L, 3392380L, 3392385L, 3392389L, 3392390L,
3392391L, 3392392L, 3392395L, 3392400L, 4406324L, 4406326L, 4406327L,
4406339L, 4406341L, 4406343L, 4406349L, 4406352L, 4406356L, 4406366L,
4406370L, 4406382L, 4406391L, 4406395L, 4406396L, 4406398L, 4406403L,
4406409L, 4406416L, 4406421L, 4406422L, 4406425L, 4406436L, 4406438L,
4406460L, 4406467L, 4406469L, 4406480L, 4406483L, 4406484L, 4406485L,
4436326L, 4436366L, 4436382L, 4436396L, 4436416L, 4436422L, 4436425L,
4436436L, 4436467L, 4436469L, 4436480L, 4436483L, 4494908L, 4494910L,
4494916L, 4494919L), child_dob = structure(c(15454, 15405, 15112,
14371, 14789, 15154, 12628, 15548, 12871, 16199, 14206, 13984,
13718, 14602, 15889, 16216, 13309, 15490, 13870, 12242, 12740,
14956, 11581, 14433, 16346, 14208, 16288, 12957, 15932, 13314,
11487, 11768, 14650, 16247, 12741, 15075, 14823, 14160, 13284,
14798, 16300, 14803, 14049, NA, 13496, 16183, NA, 14206, 11675,
11226, 14630, 13035, 13767, 15667, 15201, NA, 12014, 12646, 15524,
16247, 16884, 15722, 14792, 16310, 16959, 16173, 16765, 15535,
NA, 16659, 14909, 16004, 15088, 15293, 15844, 14653, 16304, 15694,
15958, 14993, 15274, 15332, 16363, 14846, 16304, 16597, 15226,
11404, 15126, 12299, 14852, 15606, 15884, 16515, 15475, 15982,
13819, 15996, 16512, 14823, 16631, 14442, 16672, 14120, 16499,
15269, 13863, 16755, 15690, 15380, 13232, 13482, 13997, 16924,
13431, 14692, 14789, 12156, 15839, 16585, 15884, 13266, 16027,
13214, 14770, 13228, 15578, 13892, 15133, 12759, 13151, 14447,
14922, 14841, 14572, 15804, 15260, 14869, 11111, 15579, 13473,
15099, 15588, 11801, 13915, 14692, 12526, 14145, 13782, 15131,
14759, 13354, 12747, 13389, 13530, 15554, 15211, 11365, NA, NA,
12577, 15479, NA, 14759, NA, 16528, NA, 11559, NA, NA, NA, NA,
NA, NA), class = "Date"), mother_dob = structure(c(3751, 4152,
2012, 4033, 3220, 3547, 226, 4627, 936, 3971, 488, -751, NA,
-152, 7709, 6247, 628, 1515, 2679, 1220, 982, 3470, -480, 1034,
7168, 1783, 3356, 2352, 4166, 892, 1601, 1002, 3436, 3581, 2842,
NA, 7439, 1380, -476, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7939,
7973, NA, 7396, 8002, NA, 8499, 7879, 7512, 7512, 7495, 7919,
7503, 7412, 8069, 7823, 7321, 7399, 7686, 7682, 7504, 7738, 9053,
7579, 1088, 3346, 2276, 1738, 3566, 5392, 1106, 214, 2348, 1863,
NA, NA, NA, NA, 999, NA, 4484, 8885, 1958, 3197, 5721, 2815,
5304, 5451, 2145, 5354, 4758, 2517, 1557, 5354, 2145, 6377, 6011,
4084, -406, 332, 390, -716, 2218, 3982, 1251, 2842, 3158, 2008,
5332, 5872, 1643, 7342, 3350, 5199, 1065, -2651, 6334, NA, 6508,
2554, -835, 1502, 132, 1409, 4033, 3289, 1888, 2864, 1766, 629,
739, 3158, 1915, 778, 3781, NA, 5199, NA, 6508, 132, 4033, 3289,
1888, NA, NA, 3158, NA, 7075, NA, 2985, 431), class = "Date"),
father_dob = structure(c(3048, 4252, -485, 4363, 536, 565,
56, 1953, -534, 3535, -4109, -2086, NA, -321, 4858, 4287,
2169, -895, 2459, 873, 35, 3127, -1333, -73, 3866, 1573,
3192, 1424, -3378, -182, -139, 443, 3353, 5299, 97, NA, 5978,
1369, -3093, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7606,
NA, 6055, 7062, NA, 7893, 8325, 7297, 7297, 6510, 7849, 6883,
7503, 7536, 7356, 6338, NA, 7600, 7908, 7339, 5150, 7532,
7128, 511, 454, 2154, 2348, 2703, 4071, 1491, -1679, 3894,
-1475, NA, NA, NA, NA, -179, NA, 2941, 6677, 1067, -1206,
4403, 981, 3626, 437, 829, 2087, 2980, 2364, 653, 2087, 829,
6281, 6839, 4344, -3220, 3175, 335, 410, 1961, 2730, -951,
2670, 2938, -3676, 1977, 5758, 1154, 7582, 3564, 3297, 4148,
-2770, 5088, NA, 7605, 4096, -1145, 948, 1022, 560, 3016,
-1902, 2022, -703, 40, 446, 91, 1807, 1433, 4119, NA, NA,
3297, NA, 7605, 1022, 3016, -1902, 2022, NA, NA, 1807, NA,
5903, NA, 2521, 849), class = "Date"), MG_FID = c(1L, 2L,
0L, 3L, 4L, 5L, 7L, 8L, 0L, 9L, 10L, 0L, 0L, 11L, 0L, 0L,
0L, 12L, 13L, 14L, 0L, 0L, 0L, 15L, 16L, 17L, 18L, 19L, 20L,
0L, 21L, 22L, 23L, 24L, 0L, 25L, 26L, 27L, 0L, 1L, 2L, 3L,
4L, 5L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 18L,
19L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 28L, 0L, 29L, 30L, 0L, 31L, 0L,
28L, 29L, 30L, 31L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 32L,
35L, 33L, 0L, 0L, 34L, 33L, 35L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 36L, 37L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 38L, 0L,
39L, 0L, 0L, 40L, 41L, 0L, 0L, 42L, 0L, 43L, 44L, 45L, 0L,
46L, 47L, 48L, 49L, 50L, 0L, 51L, 37L, 38L, 39L, 40L, 42L,
43L, 44L, 45L, 47L, 48L, 49L, 50L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-174L))
mg_fid
I tried the code below, but it turns all my dates into random numbers.
parent_age %>% group_by(MG_FID) %>%
mutate(father_dob = ifelse(is.na(father_dob),max(father_dob,na.rm=TRUE),father_dob),
mother_dob = ifelse(is.na(mother_dob),max(mother_dob,na.rm=TRUE),mother_dob))
Does anyone have any ideas? Let me know if what I wrote doesn't make sense.
CodePudding user response:
# ensure mg_fid is tibble
mg_fid <- mg_fid %>% as_tibble(mg_fid)
# mutate columns ending in "r_dob", exclude the rows where MG_FID==0
mg_fid_new <- bind_rows(
mg_fid %>% filter(MG_FID==0),
mg_fid %>% filter(MG_FID!=0) %>%
group_by(MG_FID) %>%
mutate(across(.cols = ends_with("r_dob"),~mean(.x, na.rm=T)))
)
Input: as above updated by OP
Output:
# A tibble: 174 x 5
IID child_dob mother_dob father_dob MG_FID
<int> <date> <date> <date> <int>
1 1101509 2011-05-18 1975-07-06 1968-09-03 0
2 1101637 2005-03-29 1972-07-25 1968-07-16 0
3 1101647 2008-04-15 1967-12-12 1964-04-16 0
4 1101650 2007-07-24 NA NA 0
5 1101654 2013-07-03 1991-02-09 1983-04-21 0
6 1101655 2014-05-26 1987-02-08 1981-09-27 0
7 1101656 2006-06-10 1971-09-21 1975-12-10 0
8 1101662 2004-11-18 1972-09-09 1970-02-05 0
9 1101674 2010-12-13 1979-07-03 1978-07-25 0
10 1101675 2001-09-16 1968-09-08 1966-05-09 0
# ... with 164 more rows
The number of NA's has been reduced in the mother and father dob columns
> sapply(mg_fid, function(x) sum(is.na(x)))
IID child_dob mother_dob father_dob MG_FID
0 15 40 43 0
> sapply(mg_fid_new, function(x) sum(is.na(x)))
IID child_dob mother_dob father_dob MG_FID
0 15 8 11 0
CodePudding user response:
The NA values can be replaced by using a combination of group_by
and fill
, like so:
library(tidyverse)
library(janitor)
#>
#> Attaching package: 'janitor'
#> The following objects are masked from 'package:stats':
#>
#> chisq.test, fisher.test
data<-structure(list(IID = c(1101480L, 1101481L, 1101509L, 1101554L,
1101600L, 1101619L, 1101623L, 1101625L, 1101637L, 1101639L, 1101644L,
1101647L, 1101650L, 1101652L, 1101654L, 1101655L, 1101656L, 1101659L,
1101660L, 1101661L, 1101662L, 1101674L, 1101675L, 1101678L, 1101679L,
1101681L, 1101685L, 1101692L, 1101693L, 1101694L, 1101696L, 1101701L,
1101705L, 1101709L, 1101712L, 1101715L, 1101717L, 1101718L, 1101720L,
1131480L, 1131481L, 1131554L, 1131600L, 1131619L, 1131623L, 1131625L,
1131639L, 1131644L, 1131652L, 1131659L, 1131660L, 1131661L, 1131678L,
1131679L, 1131685L, 1131692L, 1131696L, 1131701L, 1131705L, 1131709L,
1131715L, 1131717L, 1131718L, 1193598L, 1193602L, 1193607L, 1193609L,
1193612L, 1193617L, 1193618L, 1193620L, 1193621L, 1193622L, 1193634L,
1193638L, 1193639L, 1193640L, 1193642L, 1193653L, 1193656L, 1193659L,
1193660L, 1193661L, 1193664L, 1193667L, 1193668L, 1193689L, 2202638L,
2202647L, 2202652L, 2202655L, 2202660L, 2202661L, 2202665L, 2202673L,
2202686L, 2202693L, 2232655L, 2232661L, 2232665L, 2232686L, 2296211L,
2296214L, 2296217L, 3305120L, 3305129L, 3305135L, 3305139L, 3305141L,
3305144L, 3305151L, 3305153L, 3305154L, 3305161L, 3305164L, 3305168L,
3335154L, 3345153L, 3392379L, 3392380L, 3392385L, 3392389L, 3392390L,
3392391L, 3392392L, 3392395L, 3392400L, 4406324L, 4406326L, 4406327L,
4406339L, 4406341L, 4406343L, 4406349L, 4406352L, 4406356L, 4406366L,
4406370L, 4406382L, 4406391L, 4406395L, 4406396L, 4406398L, 4406403L,
4406409L, 4406416L, 4406421L, 4406422L, 4406425L, 4406436L, 4406438L,
4406460L, 4406467L, 4406469L, 4406480L, 4406483L, 4406484L, 4406485L,
4436326L, 4436366L, 4436382L, 4436396L, 4436416L, 4436422L, 4436425L,
4436436L, 4436467L, 4436469L, 4436480L, 4436483L, 4494908L, 4494910L,
4494916L, 4494919L), child_dob = structure(c(15454, 15405, 15112,
14371, 14789, 15154, 12628, 15548, 12871, 16199, 14206, 13984,
13718, 14602, 15889, 16216, 13309, 15490, 13870, 12242, 12740,
14956, 11581, 14433, 16346, 14208, 16288, 12957, 15932, 13314,
11487, 11768, 14650, 16247, 12741, 15075, 14823, 14160, 13284,
14798, 16300, 14803, 14049, NA, 13496, 16183, NA, 14206, 11675,
11226, 14630, 13035, 13767, 15667, 15201, NA, 12014, 12646, 15524,
16247, 16884, 15722, 14792, 16310, 16959, 16173, 16765, 15535,
NA, 16659, 14909, 16004, 15088, 15293, 15844, 14653, 16304, 15694,
15958, 14993, 15274, 15332, 16363, 14846, 16304, 16597, 15226,
11404, 15126, 12299, 14852, 15606, 15884, 16515, 15475, 15982,
13819, 15996, 16512, 14823, 16631, 14442, 16672, 14120, 16499,
15269, 13863, 16755, 15690, 15380, 13232, 13482, 13997, 16924,
13431, 14692, 14789, 12156, 15839, 16585, 15884, 13266, 16027,
13214, 14770, 13228, 15578, 13892, 15133, 12759, 13151, 14447,
14922, 14841, 14572, 15804, 15260, 14869, 11111, 15579, 13473,
15099, 15588, 11801, 13915, 14692, 12526, 14145, 13782, 15131,
14759, 13354, 12747, 13389, 13530, 15554, 15211, 11365, NA, NA,
12577, 15479, NA, 14759, NA, 16528, NA, 11559, NA, NA, NA, NA,
NA, NA), class = "Date"), mother_dob = structure(c(3751, 4152,
2012, 4033, 3220, 3547, 226, 4627, 936, 3971, 488, -751, NA,
-152, 7709, 6247, 628, 1515, 2679, 1220, 982, 3470, -480, 1034,
7168, 1783, 3356, 2352, 4166, 892, 1601, 1002, 3436, 3581, 2842,
NA, 7439, 1380, -476, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7939,
7973, NA, 7396, 8002, NA, 8499, 7879, 7512, 7512, 7495, 7919,
7503, 7412, 8069, 7823, 7321, 7399, 7686, 7682, 7504, 7738, 9053,
7579, 1088, 3346, 2276, 1738, 3566, 5392, 1106, 214, 2348, 1863,
NA, NA, NA, NA, 999, NA, 4484, 8885, 1958, 3197, 5721, 2815,
5304, 5451, 2145, 5354, 4758, 2517, 1557, 5354, 2145, 6377, 6011,
4084, -406, 332, 390, -716, 2218, 3982, 1251, 2842, 3158, 2008,
5332, 5872, 1643, 7342, 3350, 5199, 1065, -2651, 6334, NA, 6508,
2554, -835, 1502, 132, 1409, 4033, 3289, 1888, 2864, 1766, 629,
739, 3158, 1915, 778, 3781, NA, 5199, NA, 6508, 132, 4033, 3289,
1888, NA, NA, 3158, NA, 7075, NA, 2985, 431), class = "Date"),
father_dob = structure(c(3048, 4252, -485, 4363, 536, 565,
56, 1953, -534, 3535, -4109, -2086, NA, -321, 4858, 4287,
2169, -895, 2459, 873, 35, 3127, -1333, -73, 3866, 1573,
3192, 1424, -3378, -182, -139, 443, 3353, 5299, 97, NA, 5978,
1369, -3093, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 7606,
NA, 6055, 7062, NA, 7893, 8325, 7297, 7297, 6510, 7849, 6883,
7503, 7536, 7356, 6338, NA, 7600, 7908, 7339, 5150, 7532,
7128, 511, 454, 2154, 2348, 2703, 4071, 1491, -1679, 3894,
-1475, NA, NA, NA, NA, -179, NA, 2941, 6677, 1067, -1206,
4403, 981, 3626, 437, 829, 2087, 2980, 2364, 653, 2087, 829,
6281, 6839, 4344, -3220, 3175, 335, 410, 1961, 2730, -951,
2670, 2938, -3676, 1977, 5758, 1154, 7582, 3564, 3297, 4148,
-2770, 5088, NA, 7605, 4096, -1145, 948, 1022, 560, 3016,
-1902, 2022, -703, 40, 446, 91, 1807, 1433, 4119, NA, NA,
3297, NA, 7605, 1022, 3016, -1902, 2022, NA, NA, 1807, NA,
5903, NA, 2521, 849), class = "Date"), MG_FID = c(1L, 2L,
0L, 3L, 4L, 5L, 7L, 8L, 0L, 9L, 10L, 0L, 0L, 11L, 0L, 0L,
0L, 12L, 13L, 14L, 0L, 0L, 0L, 15L, 16L, 17L, 18L, 19L, 20L,
0L, 21L, 22L, 23L, 24L, 0L, 25L, 26L, 27L, 0L, 1L, 2L, 3L,
4L, 5L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 18L,
19L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 28L, 0L, 29L, 30L, 0L, 31L, 0L,
28L, 29L, 30L, 31L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 32L,
35L, 33L, 0L, 0L, 34L, 33L, 35L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 36L, 37L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 38L, 0L,
39L, 0L, 0L, 40L, 41L, 0L, 0L, 42L, 0L, 43L, 44L, 45L, 0L,
46L, 47L, 48L, 49L, 50L, 0L, 51L, 37L, 38L, 39L, 40L, 42L,
43L, 44L, 45L, 47L, 48L, 49L, 50L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-174L))
data <- data %>%
as_tibble() %>%
clean_names()
data %>%
group_by(mg_fid) %>%
fill(mother_dob:father_dob, .direction = "downup") %>%
ungroup()
#> # A tibble: 174 x 5
#> iid child_dob mother_dob father_dob mg_fid
#> <int> <date> <date> <date> <int>
#> 1 1101480 2012-04-24 1980-04-09 1978-05-07 1
#> 2 1101481 2012-03-06 1981-05-15 1981-08-23 2
#> 3 1101509 2011-05-18 1975-07-06 1968-09-03 0
#> 4 1101554 2009-05-07 1981-01-16 1981-12-12 3
#> 5 1101600 2010-06-29 1978-10-26 1971-06-21 4
#> 6 1101619 2011-06-29 1979-09-18 1971-07-20 5
#> 7 1101623 2004-07-29 1970-08-15 1970-02-26 7
#> 8 1101625 2012-07-27 1982-09-02 1975-05-08 8
#> 9 1101637 2005-03-29 1972-07-25 1968-07-16 0
#> 10 1101639 2014-05-09 1980-11-15 1979-09-06 9
#> # ... with 164 more rows
Created on 2022-02-24 by the reprex package (v2.0.1)
One question though - the mg_fid == 0
group has 82 entries, where 6 mother_dob are missing and 8 father_dob are missing. How are you planning to replace those NAs?
data %>%
count(mg_fid, sort = TRUE)
# A tibble: 51 x 2
mg_fid n
<int> <int>
1 0 82
2 1 2
3 2 2
4 3 2
5 4 2
6 5 2
7 7 2
8 8 2
9 9 2
10 10 2
# ... with 41 more rows
data %>%
filter(mg_fid == 0) %>%
map_df(~ sum(is.na(.)))
# A tibble: 1 x 5
iid child_dob mother_dob father_dob mg_fid
<int> <int> <int> <int> <int>
1 0 5 6 8 0