I have a data.table test_dt in which, for each group, some of the values of column b are missing. Can someone suggest the fastest way to interpolate and fill the missing values? Plotted as a graph, the x-axis is column a and the y-axis is column b, hence the interval spacing of column a should be taken into account when predicting b.
test_dt = structure(list(group = c("B1", "B1", "B1", "B1", "B1", "B1", "B1", "B1", "C1", "C1", "C1", "C1", "C1", "C1", "C1", "C1"), a = c(165, 170, 185, 195, 200, 210, 220, 240, 1, 1.5, 2, 4.5, 5, 5.5, 7, 10), b = c(1.925, 0.575, 0.3, NA, NA, 2.825, 9.05, 27.9, 3.775, 3.225, 2.75, 0.255, 0.04, NA, NA, NA)), row.names = c(NA, -16L), class = c("data.table", "data.frame"), index = integer(0))
> test_dt
group a b
1: B1 165.0 1.925
2: B1 170.0 0.575
3: B1 185.0 0.300
4: B1 195.0 NA
5: B1 200.0 NA
6: B1 210.0 2.825
7: B1 220.0 9.050
8: B1 240.0 27.900
9: C1 1.0 3.775
10: C1 1.5 3.225
11: C1 2.0 2.750
12: C1 4.5 0.255
13: C1 5.0 0.040
14: C1 5.5 NA
15: C1 7.0 NA
16: C1 10.0 NA
Following are some conditions -
- Interpolated values should not be zero or negative
- Interpolated values should not simply be a copy of the last non-NA value
I tried to solve it using zoo::na.spline, but the result is not correct, especially for group C1 at a = 7 and a = 10:
test_dt[, predicted := zoo::na.spline(zoo::zoo(.SD), x = a)[, 2], by = "group"]
> test_dt
group a b predicted
1: B1 165.0 1.925 1.92500000
2: B1 170.0 0.575 0.57500000
3: B1 185.0 0.300 0.30000000
4: B1 195.0 NA 0.18981191
5: B1 200.0 NA 0.40167712
6: B1 210.0 2.825 2.82500000
7: B1 220.0 9.050 9.05000000
8: B1 240.0 27.900 27.90000000
9: C1 1.0 3.775 3.77500000
10: C1 1.5 3.225 3.22500000
11: C1 2.0 2.750 2.75000000
12: C1 4.5 0.255 0.25500000
13: C1 5.0 0.040 0.04000000
14: C1 5.5 NA 0.03038963
15: C1 7.0 NA 1.67389631
16: C1 10.0 NA 16.44642968
Update 1: Below is a second dataset for which, using the solutions below, the predicted values come out extremely high.
temp_dt = structure(list(group = c("K1", "K1", "K1", "K1", "K1", "K1",
"K1", "K1", "K1", "K1", "K1", "K1", "K1", "F1", "F1", "F1", "F1",
"F1", "F1"), a = c(185, 190, 195, 200, 202.5, 205, 210, 212.5,
215, 217.5, 220, 222.5, 225, 22, 23, 24, 25, 26, 35), b = c(NA,
NA, 0.45, 0.6, 1.05, 1.45, 2.7, 3.125, 4.3, NA, 7.05, 8.95, 10.35,
0.5, 0.7, 0.925, 1.2, 1.75, NA), predicted = c(463741.812236,
14.7721571, 0.45, 0.6, 1.05, 1.45, 2.7, 3.125, 4.3, 5.6346081,
7.05, 8.95, 10.35, 0.5, 0.7, 0.925, 1.2, 1.75, 65226323253857.9
)), row.names = c(NA, -19L), class = c("data.table", "data.frame"
), index = integer(0))
temp_dt[, predicted := round(exp(zoo::na.spline(log(b), x = a, na.rm = FALSE)), 7), by = group]
    group     a      b    predicted
 1:    K1 185.0     NA 4.637418e+05
 2:    K1 190.0     NA 1.477216e+01
 3:    K1 195.0  0.450 4.500000e-01
 4:    K1 200.0  0.600 6.000000e-01
 5:    K1 202.5  1.050 1.050000e+00
 6:    K1 205.0  1.450 1.450000e+00
 7:    K1 210.0  2.700 2.700000e+00
 8:    K1 212.5  3.125 3.125000e+00
 9:    K1 215.0  4.300 4.300000e+00
10:    K1 217.5     NA 5.634608e+00
11:    K1 220.0  7.050 7.050000e+00
12:    K1 222.5  8.950 8.950000e+00
13:    K1 225.0 10.350 1.035000e+01
14:    F1  22.0  0.500 5.000000e-01
15:    F1  23.0  0.700 7.000000e-01
16:    F1  24.0  0.925 9.250000e-01
17:    F1  25.0  1.200 1.200000e+00
18:    F1  26.0  1.750 1.750000e+00
19:    F1  35.0     NA 6.522632e+13
CodePudding user response:
There are infinitely many ways to do this, each resting on a certain set of assumptions. Here is one: transform the data to the log scale, interpolate there with a natural spline, and exponentiate the result. The filled values can approach 0 but never reach 0 or go negative.
library(zoo)
round(exp(na.spline(log(test_dt$b), na.rm = FALSE, method = "natural")), 5)
[1] 1.92500 0.57500 0.30000 0.39398 0.96937 2.82500 9.05000 27.90000 3.77500
[10] 3.22500 2.75000 0.25500 0.04000 0.00835 0.00175 0.00036
Also, you should perform this separately for each group so that there is no interpolation across group boundaries, and pass x = a so that the uneven spacing of a is respected (the call above interpolates on the row index). A sketch of that grouped version follows.
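A minimal sketch of that grouped, spacing-aware version (the same idea wrapped in a data.table by-group call; the column name predicted is my own choice):
library(data.table)
library(zoo)
# interpolate log(b) within each group, using a as the x coordinate so the
# uneven spacing of a is respected, then exponentiate back to stay positive
test_dt[, predicted := exp(na.spline(log(b), x = a, na.rm = FALSE,
                                     method = "natural")), by = group]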
CodePudding user response:
Ultimately, you have to decide on the interpolation procedure based on scientific background knowledge. However, to avoid producing negative values, a log-transformation is useful. In the following, I combine it with spline interpolation.
library(data.table)
test_dt = data.table(group = c("B1", "B1", "B1", "B1", "B1", "B1",
"B1", "B1", "C1", "C1", "C1", "C1", "C1", "C1", "C1", "C1"),
a = c(165, 170, 185, 195, 200, 210, 220, 240, 1, 1.5, 2, 4.5, 5, 5.5, 7, 10),
b = c(1.925, 0.575, 0.3, NA, NA, 2.825, 9.05, 27.9, 3.775, 3.225, 2.75, 0.255,
0.04, NA, NA, NA))
library(zoo)
test_dt[, c := exp(na.spline(log(b), x = a, na.rm = FALSE)), by = group]
library(ggplot2)
ggplot(test_dt, aes(x = a, color = group)) +
  geom_line(aes(y = c)) +
  geom_point(aes(y = c, color = "interpolated")) +
  geom_point(aes(y = b))
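Regarding Update 1: the huge values come from the spline extrapolating beyond the observed range of b (the leading NAs of K1 and the trailing NA of F1 lie outside the non-NA points, and a cubic on the log scale grows very fast there). A hedged variant, assuming it is acceptable to leave those edge NAs unfilled rather than extrapolate:
# fill only NAs whose a lies inside each group's observed range, so the
# spline interpolates but never extrapolates; edge NAs stay NA
temp_dt[, predicted := {
  obs <- !is.na(b)
  fit <- exp(zoo::na.spline(log(b), x = a, na.rm = FALSE))
  inside <- a >= min(a[obs]) & a <= max(a[obs])
  fifelse(inside, fit, b)
}, by = group]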
CodePudding user response:
Try different spline methods. Here I also clamp values below 0.0001 using pmax.
met <- c("fmm", "natural", "monoH.FC")
# for each method, spline-interpolate b against a within each group (ave
# splits the row indices by group), then clamp the result at 0.0001
cbind(test_dt, lapply(setNames(met, met), function(met) {
  ave(seq_len(nrow(test_dt)), test_dt$group, FUN = function(i) {
    pmax(0.0001, splinefun(test_dt$a[i], test_dt$b[i], met)(test_dt$a[i]))
  })
}))
# group a b fmm natural monoH.FC
#1 B1 165.0 1.925 1.92500000 1.9250000 1.9250000
#2 B1 170.0 0.575 0.57500000 0.5750000 0.5750000
#3 B1 185.0 0.300 0.30000000 0.3000000 0.3000000
#4 B1 195.0 NA 0.18981191 0.2372716 0.3751691
#5 B1 200.0 NA 0.40167712 0.4448177 0.6702794
#6 B1 210.0 2.825 2.82500000 2.8250000 2.8250000
#7 B1 220.0 9.050 9.05000000 9.0500000 9.0500000
#8 B1 240.0 27.900 27.90000000 27.9000000 27.9000000
#9 C1 1.0 3.775 3.77500000 3.7750000 3.7750000
#10 C1 1.5 3.225 3.22500000 3.2250000 3.2250000
#11 C1 2.0 2.750 2.75000000 2.7500000 2.7500000
#12 C1 4.5 0.255 0.25500000 0.2550000 0.2550000
#13 C1 5.0 0.040 0.04000000 0.0400000 0.0400000
#14 C1 5.5 NA 0.03038963 0.0001000 0.0001000
#15 C1 7.0 NA 1.67389631 0.0001000 0.0001000
#16 C1 10.0 NA 16.44642968 0.0001000 0.0001000
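If the hard floor from pmax is undesirable (it yields the flat 0.0001 runs above, which brushes against the "no copies" condition), one alternative sketch is to run the same method comparison on the log scale per group, so positivity comes for free; the *_log column names are my own:
library(data.table)
# fit each spline method to log(b) within each group, then exponentiate so
# the interpolated values are strictly positive without clamping
for (m in met) {
  test_dt[, paste0(m, "_log") := {
    obs <- !is.na(b)
    exp(splinefun(a[obs], log(b[obs]), method = m)(a))
  }, by = group]
}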