Interpolate row-wise values of 0 between two columns with values >0 in R-CodePudding

I try to interpolate values of 0 between two values unequal to zero row-wise for the columns: 2018 to 2021 of a data.table in R. This is how a sample data df1 would look like:

   ID string1 2018 2019 2020 2021 string2
1: a1      x2    3    3    0    4      si
2: a2      g3    5    5    4    0      q2
3: a3      n2   11    0    0    3      oq
4: a4      m3    3    0    9    8      mx
5: a5      2w    9    1    6    5      ix
6: a6     ps2    2    4    7    4      p2
7: a7     kg2    6    0    9    6      2q

For convenient reproducibility:

df1 = data.table(
  ID = c("a1", "a2", "a3", "a4", "a5", "a6", "a7"),
  "string1" = c("x2", "g3", "n2", "m3", "2w", "ps2", "kg2"),
  "2018" = c(3,5,11,3,9,2,6),
  "2019" = c(3,5,0,0,1,4,0),
  "2020" = c(0,4,0,9,6,7,9),
  "2021" = c(4,0,3,8,5,4,6),
  "string2" = c("si", "q2", "oq", "mx", "ix", "p2", "2q"))

In df1 there are cases with a zero between two numbers >0 (for example; row 1/column 2020, row 4/column 2019 or row 7 column 2019). I try to identify these cases and interpolate them with the neighbour columns (for example; row 1/column 2020: 3 4 =3.5).

Is there a way to deal with that? So far, I only found a method to replace all the zero values, but without the condition of being between two numbers >0.

I try to get such an output:

   ID string1 2018 2019 2020 2021 string2
1: a1      x2    3  3.0  3.5    4      si
2: a2      g3    5  5.0  4.0    0      q2
3: a3      n2   11  0.0  0.0    3      oq
4: a4      m3    3  6.0  9.0    8      mx
5: a5      2w    9  1.0  6.0    5      ix
6: a6     ps2    2  4.0  7.0    4      p2
7: a7     kg2    6  7.5  9.0    6      2q

Thank you very much!

CodePudding user response：

Function to interpolate zeros between two positive elements:

f <- function(vec){
  
  prev_val <- shift(vec, 1, fill = 0)
  next_val <- shift(vec, -1, fill = 0)
  
  fifelse(prev_val > 0 & next_val > 0 & vec == 0, (prev_val   next_val) / 2, vec)
}

Applying function to all rows for year columns:

year_cols <- names(df1)[grep("^[0-9] $", names(df1))]
df1[, (year_cols) := transpose(lapply(transpose(.SD), f)), .SDcols = year_cols]

transpose is used because you want to do change on rows. Second use is to return it into column format.

CodePudding user response：

Maybe it is an overkill, but here is a solution using reshaping twice:

melt(df1, measure.vars = patterns("^[0-9] $")
     )[,value := fifelse(value == 0 &
                           shift(value, type = "lag", fill = 0) > 0 &
                           shift(value, type = "lead", fill = 0) > 0,
                         (shift(value, type = "lag")   shift(value, type = "lead")) / 2,
                         value), by = ID
       ][, dcast(.SD, ...~variable) ]

#    ID string1 string2 2018 2019 2020 2021
# 1: a1      x2      si    3  3.0  3.5    4
# 2: a2      g3      q2    5  5.0  4.0    0
# 3: a3      n2      oq   11  0.0  0.0    3
# 4: a4      m3      mx    3  6.0  9.0    8
# 5: a5      2w      ix    9  1.0  6.0    5
# 6: a6     ps2      p2    2  4.0  7.0    4
# 7: a7     kg2      2q    6  7.5  9.0    6

Edit: To fill in all NAs we can use zoo::na.approx or zoo::na.spline

cols <- grep("^[0-9] $", names(df1), value = TRUE)

df1[, (cols) := transpose(lapply(transpose(.SD), function(i) zoo::na.approx(
  ifelse(i == 0, NA, i), na.rm = FALSE))),
  .SDcols = cols ]
# Using na.approx, notice 2nd row for 2021 is NA.
#    ID string1 2018     2019     2020 2021 string2
# 1: a1      x2    3 3.000000 3.500000    4      si
# 2: a2      g3    5 5.000000 4.000000   NA      q2
# 3: a3      n2   11 8.333333 5.666667    3      oq
# 4: a4      m3    3 6.000000 9.000000    8      mx
# 5: a5      2w    9 1.000000 6.000000    5      ix
# 6: a6     ps2    2 4.000000 7.000000    4      p2
# 7: a7     kg2    6 7.500000 9.000000    6      2q

# Using na.spline
#    ID string1 2018     2019     2020 2021 string2
# 1: a1      x2    3 3.000000 3.333333    4      si
# 2: a2      g3    5 5.000000 4.000000    2      q2
# 3: a3      n2   11 8.333333 5.666667    3      oq
# 4: a4      m3    3 7.333333 9.000000    8      mx
# 5: a5      2w    9 1.000000 6.000000    5      ix
# 6: a6     ps2    2 4.000000 7.000000    4      p2
# 7: a7     kg2    6 9.000000 9.000000    6      2q

CodePudding user response：

Using data.table functions (and the original data.frame), this code (a bit cumbersome) should work:

for (i in c(2019,2020)){
  x = which(colnames(df1) == i)
  df1[,x] <- ifelse(c(df1[,.SD,.SDcols = x] == 0 & df1[,.SD,.SDcols = c(x-1)] > 0 & df1[,.SD,.SDcols = c(x 1)] > 0), 
                    rowMeans(df1[,.SD,.SDcols = c(x-1,x 1)]), unlist(df1[,.SD,.SDcols = x]))
}

> df1
   ID string1 2018 2019 2020 2021 string2
1: a1      x2    3  3.0  3.5    4      si
2: a2      g3    5  5.0  4.0    0      q2
3: a3      n2   11  0.0  0.0    3      oq
4: a4      m3    3  6.0  9.0    8      mx
5: a5      2w    9  1.0  6.0    5      ix
6: a6     ps2    2  4.0  7.0    4      p2
7: a7     kg2    6  7.5  9.0    6      2q

And here is a base R solution (using data.frame instead of data.table to generate the data):

for (i in c("X2019","X2020")){
  x = which(colnames(df1) == i)
  df1[,x] <- ifelse(df1[,x] == 0 & df1[,x-1] > 0 & df1[,x 1] > 0, rowMeans(df1[,c(x-1,x 1)]), df1[,x])
}