I am trying to interpolate some values by group using zoo:na.approx
. The dataframe needs to start and end with non-NA values. Is there a way to remove them but keeping the "inner" NAs? I cannot use a filter based on other variables since the interpolation is performed by groups and the missing values vary among them.
Here an example of my code:
library(zoo)
library(lubridate)
library(dplyr)
set.seed(471)
db <- rep(seq(ymd("2021-12-20"), ymd("2021-12-30"), by = "days"),4) %>% merge(seq(1,4,1)) %>%
mutate(z=rnorm(176))
db$z[db$z<0] <- NA
db %>% group_by(y) %>% mutate(aa=na.approx(z))
CodePudding user response:
Add the rule=2
argument to na.approx
to extrapolate NA
s at the beginning and end of each group so that they are not NA
db %>%
group_by(y) %>%
mutate(aa=na.approx(z, rule = 2)) %>%
ungroup
or use na.trim
to remove the NA's at the beginning and end of each group.
db %>%
group_by(y) %>%
group_modify(~ na.trim(.)) %>%
mutate(aa = na.approx(z)) %>%
ungroup
CodePudding user response:
I'll focus on the first/last 3 rows of each group:
db %>%
group_by(y) %>%
slice(c(1:3, n() - 2:0)) %>%
print(n=99)
# # A tibble: 24 x 3
# # Groups: y [4]
# x y z
# <date> <dbl> <dbl>
# 1 2021-12-20 1 NA
# 2 2021-12-21 1 0.605
# 3 2021-12-22 1 0.185
# 4 2021-12-28 1 0.805
# 5 2021-12-29 1 NA
# 6 2021-12-30 1 NA
# 7 2021-12-20 2 NA
# 8 2021-12-21 2 0.402
# 9 2021-12-22 2 NA
# 10 2021-12-28 2 NA
# 11 2021-12-29 2 0.163
# 12 2021-12-30 2 0.796
# 13 2021-12-20 3 1.00
# 14 2021-12-21 3 NA
# 15 2021-12-22 3 0.733
# 16 2021-12-28 3 0.00858
# 17 2021-12-29 3 NA
# 18 2021-12-30 3 0.179
# 19 2021-12-20 4 NA
# 20 2021-12-21 4 0.298
# 21 2021-12-22 4 NA
# 22 2021-12-28 4 0.355
# 23 2021-12-29 4 2.42
# 24 2021-12-30 4 NA
Groups 1 and 4 starts/ends on an NA
, group 2 startd on an NA
.
Try this:
db %>%
group_by(y) %>%
filter(cumany(!is.na(z)) & rev(cumany(rev(!is.na(z))))) %>%
slice(c(1:3, n() - 2:0)) %>%
print(n=99)
# # A tibble: 24 x 3
# # Groups: y [4]
# x y z
# <date> <dbl> <dbl>
# 1 2021-12-21 1 0.605
# 2 2021-12-22 1 0.185
# 3 2021-12-23 1 NA
# 4 2021-12-26 1 0.871
# 5 2021-12-27 1 NA
# 6 2021-12-28 1 0.805
# 7 2021-12-21 2 0.402
# 8 2021-12-22 2 NA
# 9 2021-12-23 2 0.364
# 10 2021-12-28 2 NA
# 11 2021-12-29 2 0.163
# 12 2021-12-30 2 0.796
# 13 2021-12-20 3 1.00
# 14 2021-12-21 3 NA
# 15 2021-12-22 3 0.733
# 16 2021-12-28 3 0.00858
# 17 2021-12-29 3 NA
# 18 2021-12-30 3 0.179
# 19 2021-12-21 4 0.298
# 20 2021-12-22 4 NA
# 21 2021-12-23 4 0.660
# 22 2021-12-27 4 NA
# 23 2021-12-28 4 0.355
# 24 2021-12-29 4 2.42
CodePudding user response:
You can first perform an approximation, and then remove the NA
s:
db %>%
group_by(y) %>%
mutate(output = zoo::na.approx(z, na.rm = FALSE))
Output:
# A tibble: 176 x 4
# Groups: y [4]
x y z test
<date> <dbl> <dbl> <dbl>
1 2021-12-20 1 NA NA
2 2021-12-21 1 0.605 0.605
3 2021-12-22 1 0.185 0.185
4 2021-12-23 1 NA 0.455
5 2021-12-24 1 0.725 0.725
6 2021-12-25 1 1.51 1.51
7 2021-12-26 1 NA 1.41
8 2021-12-27 1 1.31 1.31
9 2021-12-28 1 1.07 1.07
10 2021-12-29 1 1.14 1.14
As you can, in part, see, the na.rm = FALSE
argument in na.approx
keeps the top and bot NA
for each group, while calculating an approximation within the groups. You can then filter the data to remove the NA
in the new created column:
db %>%
group_by(y) %>%
mutate(output = zoo::na.approx(z, na.rm = F)) %>%
ungroup() %>%
filter(!is.na(output))
CodePudding user response:
You could use imputeTS::na_kalman
, which also extrapolates.
r <- do.call(rbind, by(db, db$y, FUN=\(x) transform(x, aa=imputeTS::na_kalman(z))))
tail(r[r$y == 1, ])
# x y z aa
# 1.39 2021-12-25 1 0.020848035 0.020848035
# 1.40 2021-12-26 1 0.017171691 0.017171691
# 1.41 2021-12-27 1 0.007122718 0.007122718
# 1.42 2021-12-28 1 NA 0.392535303
# 1.43 2021-12-29 1 0.629796532 0.629796532
# 1.44 2021-12-30 1 NA 0.258814648
Data:
db <- structure(list(x = structure(c(18981, 18982, 18983, 18984, 18985,
18986, 18987, 18988, 18989, 18990, 18991, 18981, 18982, 18983,
18984, 18985, 18986, 18987, 18988, 18989, 18990, 18991, 18981,
18982, 18983, 18984, 18985, 18986, 18987, 18988, 18989, 18990,
18991, 18981, 18982, 18983, 18984, 18985, 18986, 18987, 18988,
18989, 18990, 18991, 18981, 18982, 18983, 18984, 18985, 18986,
18987, 18988, 18989, 18990, 18991, 18981, 18982, 18983, 18984,
18985, 18986, 18987, 18988, 18989, 18990, 18991, 18981, 18982,
18983, 18984, 18985, 18986, 18987, 18988, 18989, 18990, 18991,
18981, 18982, 18983, 18984, 18985, 18986, 18987, 18988, 18989,
18990, 18991, 18981, 18982, 18983, 18984, 18985, 18986, 18987,
18988, 18989, 18990, 18991, 18981, 18982, 18983, 18984, 18985,
18986, 18987, 18988, 18989, 18990, 18991, 18981, 18982, 18983,
18984, 18985, 18986, 18987, 18988, 18989, 18990, 18991, 18981,
18982, 18983, 18984, 18985, 18986, 18987, 18988, 18989, 18990,
18991, 18981, 18982, 18983, 18984, 18985, 18986, 18987, 18988,
18989, 18990, 18991, 18981, 18982, 18983, 18984, 18985, 18986,
18987, 18988, 18989, 18990, 18991, 18981, 18982, 18983, 18984,
18985, 18986, 18987, 18988, 18989, 18990, 18991, 18981, 18982,
18983, 18984, 18985, 18986, 18987, 18988, 18989, 18990, 18991
), class = "Date"), y = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4), z = c(0.305344789017667,
0.256644623614096, NA, 1.31852719135355, 0.115506505762677, 0.732802091953865,
NA, 0.239925107412262, 0.685318244939073, 0.691973256906341,
1.32378575746467, NA, 0.384693043255873, 1.45895509632899, NA,
0.0599714441492927, NA, NA, NA, NA, NA, 0.71683339822062, NA,
3.27310516365819, 1.69204573033578, NA, 0.14017486940184, NA,
1.16261380170504, NA, NA, NA, 1.68438289810619, NA, NA, 1.31386940315565,
0.594623922245712, NA, 0.0208480351055444, 0.0171716909393243,
0.00712271758331095, NA, 0.629796532479193, NA, 0.244580018794366,
NA, 0.820911116824006, NA, NA, 0.557088403848106, 0.0130780982496676,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1.28902764727033,
0.242390057597798, NA, 1.75609046517858, 0.921685169855448, 0.240269454747801,
NA, 0.133290865347424, 0.760944667549314, NA, 2.10865624982592,
0.201965354187563, NA, 0.372617511993437, 0.40925122336274, 0.598185767876918,
NA, NA, 1.51486434937749, NA, 0.365799492559624, 1.93980359376164,
NA, NA, NA, 1.39839171014837, NA, NA, 1.131273582479, 1.35134680218024,
NA, 1.02956577738351, 0.271873664141861, 0.777813782525466, NA,
NA, 0.286721974151372, 0.0305405702707527, NA, NA, 0.922064532313788,
NA, 0.211308210750866, NA, NA, 0.416086290075234, 0.744175318362445,
1.05570394997758, NA, 2.10096763825364, NA, NA, 0.945801512771798,
1.64923864766573, NA, 0.0338301608791077, 1.93867810865554, 0.611903344641826,
NA, NA, NA, 0.664664842786913, 0.992532329760494, 0.106067365628389,
NA, NA, 0.253237072580547, 1.39727781231248, 0.750659506338532,
NA, NA, 0.531677176826455, NA, 0.334496935245917, NA, 0.237217689673067,
NA, 0.729615340974382, 0.418007005399876, NA, NA, NA, 0.575142620388619,
2.27297683347494, NA, 1.0088509112411, NA, NA, NA, 1.07213691727514,
NA, 0.950964366873889, NA, NA, 1.37008596018781, NA, 0.581570283604887,
0.903895963902468, NA, 0.170520505104898, 0.664123540127705,
1.20066990898952, NA, 0.243496848502427, 0.679868588335254, NA,
2.09127742408436, 0.77948087799739, NA, 0.658167166169738, NA,
2.15919199233993, NA, 0.778191585042783)), row.names = c(NA,
-176L), class = "data.frame")
CodePudding user response:
Another possible solution:
library(zoo)
library(lubridate)
library(dplyr)
set.seed(471)
db <- rep(seq(ymd("2021-12-20"), ymd("2021-12-30"), by = "days"),4) %>% merge(seq(1,4,1)) %>%
mutate(z=rnorm(176))
db$z[db$z<0] <- NA
db %>%
group_by(y) %>%
mutate(aux = data.table::rleid(z)) %>%
filter(!((aux == 1 | aux == max(aux)) & is.na(z))) %>%
ungroup %>% select(-aux) %>% mutate(aa=na.approx(z))
#> # A tibble: 170 × 4
#> x y z aa
#> <date> <dbl> <dbl> <dbl>
#> 1 2021-12-21 1 0.605 0.605
#> 2 2021-12-22 1 0.185 0.185
#> 3 2021-12-23 1 NA 0.455
#> 4 2021-12-24 1 0.725 0.725
#> 5 2021-12-25 1 1.51 1.51
#> 6 2021-12-26 1 NA 1.41
#> 7 2021-12-27 1 1.31 1.31
#> 8 2021-12-28 1 1.07 1.07
#> 9 2021-12-29 1 1.14 1.14
#> 10 2021-12-30 1 NA 0.585
#> # … with 160 more rows