I have a massive dataset with millions of lines that I want to fill in the missing values with either the previous value, next value or average of previous and next value available value. It will check for previous three rows and next three rows for availability of non-missing value. Here missing value is represented by -99. I wrote ifelse
loop for that but it's taking me hours to do this task. Can we accomplish it more efficiently and quickly?
rec_30 <- which(data$TEMPERATURE %in% c(-99) & data$MIN == 30)
for(q in rec_30){
ifelse(q <= 4, aa <- data$TEMPERATURE[q],
ifelse(data$TEMPERATURE[q-1] > 0, aa <- data$TEMPERATURE[q-1],
ifelse(data$TEMPERATURE[q-2] > 0, aa <- data$TEMPERATURE[q-2],
ifelse(data$TEMPERATURE[q-3], aa <- data$TEMPERATURE[q-3], aa <- data$TEMPERATURE[q]))))
ifelse(data$TEMPERATURE[q 1] > 0, bb <- data$TEMPERATURE[q 1],
ifelse(data$TEMPERATURE[q 2] > 0, bb <- data$TEMPERATURE[q 2],
ifelse(data$TEMPERATURE[q 3] > 0, bb <- data$TEMPERATURE[q 3], bb <- data$TEMPERATURE[q])))
ifelse(aa > 0 & bb > 0, data$TEMPERATURE[q] <- (aa bb)/2,
ifelse(aa > 0 & bb == -99, data$TEMPERATURE[q] <- aa,
ifelse(aa == -99 & bb > 0, data$TEMPERATURE[q] <- bb, data$TEMPERATURE[q] <- data$TEMPERATURE[q])))
}
CodePudding user response:
Here is a an option that leverages data.table::shift()
- set your data to data.table, and change -99 to NA
library(data.table)
setDT(data)
data[TEMPERATURE==-99, TEMPERATURE:=NA]
- Create columns for the previous three temp values and the next temp values and an
id
column for subsequent evaluation by row.
cols = c(paste0("next",1:3), paste0("prior",1:3))
data[,(cols):= shift(TEMPERATURE, c(-1:-3,1:3))]
data[, id:=.I]
- use
fcase
by row to get the correct combination of next/prev for each row
data[is.na(TEMPERATURE), IMPUTED_TEMPERATURE:=fcase(
any(next1,prior1), mean(c(next1,prior1), na.rm=T),
any(next2,prior2), mean(c(next2,prior2), na.rm=T),
any(next3,prior3), mean(c(next3,prior3), na.rm=T),
default=NA), id][, (c(cols,"id")):=NULL]
Here is alternative approach that would replace steps 2 and 3 above. It uses rowMeans
. Note that I call shift()
three times here, each time for the set of /-1 row, /-2 rows, /-3 rows (but this can be further simplified if you want to take the mean of any and all values that are available in the 3 rows before and 3 rows after)
# Get the three sets of rowmeans
rmeans = cbind(
rowMeans(data[, shift(TEMPERATURE, c(-1,1))], na.rm=T),
rowMeans(data[, shift(TEMPERATURE, c(-2,2))], na.rm=T),
rowMeans(data[, shift(TEMPERATURE, c(-3,3))], na.rm=T)
)
# Set imputed temperature to the first one that is not NA
data[, imputed:=apply(rmeans,1,\(x) x[which(!is.na(x))[1]])]
# Set imputed back to NA if it does not need to be imputed
data[!is.na(TEMPERATURE), imputed:=NA]
Output (first 10 rows; same with either method)
TEMPERATURE IMPUTED_TEMPERATURE
<num> <num>
1: 21.13703 NA
2: NA 23.61489
3: 26.09275 NA
4: 26.23379 NA
5: 28.60915 NA
6: NA 24.35206
7: 20.09496 NA
8: 22.32551 NA
9: NA 23.73401
10: 25.14251 NA
Input:
data = structure(list(TEMPERATURE = c(21.1370341130532, -99, 26.0927473288029,
26.2337944167666, 28.6091538355686, -99, 20.0949575635605, 22.3255050601438,
-99, 25.1425114134327, -99, 25.4497483558953, 22.8273358359002,
29.2343348427676, 22.9231584025547, 28.3729562815279, -99, 22.6682078000158,
-99, -99, -99, 23.0269337072968, 21.590460028965, 20.3999591805041,
22.1879954100586, 28.1059855245985, -99, -99, 28.3134504687041,
-99, 24.5609148242511, -99, 23.0467220302671, 25.0730687007308,
-99, -99, 22.0124803762883, 22.5880981865339, 29.9215041752905,
28.0735234031454, -99, -99, -99, 26.2181919813156, 23.2977017574012,
25.0199747295119, 26.7709452728741, -99, 22.4392882734537, 27.6545978756621,
-99, 23.0968660186045, -99, 25.0454591214657, 21.5299895894714,
25.0393348815851, -99, -99, 21.7464982392266, -99, -99, 20.4185727518052,
-99, 20.137499391567, -99, -99, 23.0809475714341, 25.0854756566696,
20.5164661933668, -99, -99, -99, 20.1462725573219, -99, 20.899613329675,
-99, -99, 20.7005249732174, -99, -99, -99, 24.7190972114913,
21.4261534321122, -99, -99, 28.9858048921451, 23.8949978468008,
23.1087077967823, -99, 28.9618584956042, 21.6639378038235, 29.0042459615506,
21.3407819508575, 21.3161413418129, 21.0528750251979, 25.1158358110115,
23.0019905394875, -99, 23.0964743136428, 27.4211965710856)), row.names = c(NA,
-100L), class = "data.frame")