Improve the performance of if-else loop to update missing value-CodePudding

I have a massive dataset with millions of lines that I want to fill in the missing values with either the previous value, next value or average of previous and next value available value. It will check for previous three rows and next three rows for availability of non-missing value. Here missing value is represented by -99. I wrote ifelse loop for that but it's taking me hours to do this task. Can we accomplish it more efficiently and quickly?

rec_30 <- which(data$TEMPERATURE %in% c(-99) & data$MIN == 30)

for(q in rec_30){
    ifelse(q <= 4, aa <- data$TEMPERATURE[q],                                         
           ifelse(data$TEMPERATURE[q-1] > 0, aa <- data$TEMPERATURE[q-1],
                  ifelse(data$TEMPERATURE[q-2] > 0, aa <- data$TEMPERATURE[q-2],
                         ifelse(data$TEMPERATURE[q-3], aa <- data$TEMPERATURE[q-3], aa <- data$TEMPERATURE[q]))))
    
    ifelse(data$TEMPERATURE[q 1] > 0, bb <- data$TEMPERATURE[q 1],
           ifelse(data$TEMPERATURE[q 2] > 0, bb <- data$TEMPERATURE[q 2],
                  ifelse(data$TEMPERATURE[q 3] > 0, bb <- data$TEMPERATURE[q 3], bb <- data$TEMPERATURE[q])))
    
    ifelse(aa > 0 & bb > 0, data$TEMPERATURE[q] <- (aa   bb)/2,
           ifelse(aa > 0 & bb == -99, data$TEMPERATURE[q] <- aa,
                  ifelse(aa == -99 & bb > 0, data$TEMPERATURE[q] <- bb, data$TEMPERATURE[q] <- data$TEMPERATURE[q])))
}

CodePudding user response：

Here is a an option that leverages data.table::shift()

set your data to data.table, and change -99 to NA

library(data.table)
setDT(data)
data[TEMPERATURE==-99, TEMPERATURE:=NA]

Create columns for the previous three temp values and the next temp values and an id column for subsequent evaluation by row.


cols = c(paste0("next",1:3), paste0("prior",1:3))
data[,(cols):= shift(TEMPERATURE, c(-1:-3,1:3))]
data[, id:=.I]

use fcase by row to get the correct combination of next/prev for each row

data[is.na(TEMPERATURE), IMPUTED_TEMPERATURE:=fcase(
  any(next1,prior1), mean(c(next1,prior1), na.rm=T),
  any(next2,prior2), mean(c(next2,prior2), na.rm=T),
  any(next3,prior3), mean(c(next3,prior3), na.rm=T),
  default=NA), id][, (c(cols,"id")):=NULL]

Here is alternative approach that would replace steps 2 and 3 above. It uses rowMeans. Note that I call shift() three times here, each time for the set of /-1 row, /-2 rows, /-3 rows (but this can be further simplified if you want to take the mean of any and all values that are available in the 3 rows before and 3 rows after)

# Get the three sets of rowmeans
rmeans = cbind(
  rowMeans(data[, shift(TEMPERATURE, c(-1,1))], na.rm=T),
  rowMeans(data[, shift(TEMPERATURE, c(-2,2))], na.rm=T),
  rowMeans(data[, shift(TEMPERATURE, c(-3,3))], na.rm=T)
)
# Set imputed temperature to the first one that is not NA
data[, imputed:=apply(rmeans,1,\(x) x[which(!is.na(x))[1]])]

# Set imputed back to NA if it does  not need to be imputed
data[!is.na(TEMPERATURE), imputed:=NA]

Output (first 10 rows; same with either method)

    TEMPERATURE IMPUTED_TEMPERATURE
          <num>               <num>
 1:    21.13703                  NA
 2:          NA            23.61489
 3:    26.09275                  NA
 4:    26.23379                  NA
 5:    28.60915                  NA
 6:          NA            24.35206
 7:    20.09496                  NA
 8:    22.32551                  NA
 9:          NA            23.73401
10:    25.14251                  NA

Input:

data = structure(list(TEMPERATURE = c(21.1370341130532, -99, 26.0927473288029, 
26.2337944167666, 28.6091538355686, -99, 20.0949575635605, 22.3255050601438, 
-99, 25.1425114134327, -99, 25.4497483558953, 22.8273358359002, 
29.2343348427676, 22.9231584025547, 28.3729562815279, -99, 22.6682078000158, 
-99, -99, -99, 23.0269337072968, 21.590460028965, 20.3999591805041, 
22.1879954100586, 28.1059855245985, -99, -99, 28.3134504687041, 
-99, 24.5609148242511, -99, 23.0467220302671, 25.0730687007308, 
-99, -99, 22.0124803762883, 22.5880981865339, 29.9215041752905, 
28.0735234031454, -99, -99, -99, 26.2181919813156, 23.2977017574012, 
25.0199747295119, 26.7709452728741, -99, 22.4392882734537, 27.6545978756621, 
-99, 23.0968660186045, -99, 25.0454591214657, 21.5299895894714, 
25.0393348815851, -99, -99, 21.7464982392266, -99, -99, 20.4185727518052, 
-99, 20.137499391567, -99, -99, 23.0809475714341, 25.0854756566696, 
20.5164661933668, -99, -99, -99, 20.1462725573219, -99, 20.899613329675, 
-99, -99, 20.7005249732174, -99, -99, -99, 24.7190972114913, 
21.4261534321122, -99, -99, 28.9858048921451, 23.8949978468008, 
23.1087077967823, -99, 28.9618584956042, 21.6639378038235, 29.0042459615506, 
21.3407819508575, 21.3161413418129, 21.0528750251979, 25.1158358110115, 
23.0019905394875, -99, 23.0964743136428, 27.4211965710856)), row.names = c(NA, 
-100L), class = "data.frame")