Home > front end >  Calculating average of preceeding and following values for NAs in a column in R
Calculating average of preceeding and following values for NAs in a column in R

Time:12-07

I have a data set that has measured values in some samples and leaves NAs in others. In between each measured value is 10 samples with no value. I want the value of those ten samples to each be the average of the proceeding and following measured values. The data looks like this:

data

Importantly, the values for each of the ten NAs between any two known values has to be the same. I previously tried using zoo::na.approx(RC, rule=2), but that calculates averages using its own interpolated data, not just the two known values. The output should look like this:

enter image description here

Edit as per deschen's request:

dput(rc_report[,c(2,26)]) structure(list(sample_id = c("REFTTO_IS_211201_1_b", "ARL2108200_b", "ARL2108201_b", "ARL2108202_b", "ARL2108203_b", "ARL2108204_b", "ARL2108205_b", "ARL2108206_b", "ARL2108207_b", "ARL2108208_b", "ARL2108209_b", "REFTTO_IS_211201_2", "ARL2108210_b", "ARL2108211_b", "ARL2108212_b", "ARL2108213_b", "ARL2108214_b", "ARL2108215_b", "ARL2108216_b", "ARL2108217_b", "ARL2108218_b", "ARL2108219_b", "REFTTO_IS_211201_3", "REFTTO_IS_211203_1", "ARL2108220", "ARL2108221", "ARL2108222", "ARL2108223", "ARL2108224", "ARL2108225", "ARL2108226", "ARL2108227", "ARL2108228", "ARL2108229", "REFTTO_IS_211203_2", "ARL2108230", "ARL2108231", "ARL2108232", "ARL2108233", "ARL2108234", "ARL2108235", "ARL2108236", "ARL2108237", "ARL2108238", "ARL2108239", "REFTTO_IS_211203_3", "REFTTO_IS_211206_1", "ARL2108240", "ARL2108241", "ARL2108242", "ARL2108243", "ARL2108244", "ARL2108245", "ARL2108246", "ARL2108247", "ARL2108248", "ARL2108249", "REFTTO_IS_211206_2", "ARL2108250", "ARL2108251", "ARL2108252", "ARL2108253", "ARL2108254", "ARL2108255", "ARL2108256", "ARL2108258", "ARL2108259", "REFTTO_IS_211206_3" ), response_coefficient = c("1.09785865302384", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "1.09822862814289", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "1.09835314677401", "1.09942926690693", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "1.10084276861106", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "1.10078178211056", "1.11104600880183", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "1.11203467893562", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "1.11344223852824" )), class = "data.frame", row.names = c(NA, -68L))

CodePudding user response:

I solved this problem using the following code. It's probably pretty inefficient, but it does the job.

enter image description here

CodePudding user response:

Using the data shown in the Note at the end (we changed the column names to shorter ones) use na.locf0 both forwards and backwards taking the average.

library(dplyr)
library(zoo)

dd %>%
  mutate(coef = as.numeric(ifelse(coef == "NA", NA, coef)),
         coef = (na.locf0(coef)   na.locf0(coef, fromLast = TRUE)) / 2)

Note

dd <- structure(list(sample_id = c("REFTTO_IS_211201_1_b", "ARL2108200_b", "ARL2108201_b", "ARL2108202_b", "ARL2108203_b", "ARL2108204_b", "ARL2108205_b", "ARL2108206_b", "ARL2108207_b", "ARL2108208_b", "ARL2108209_b", "REFTTO_IS_211201_2", "ARL2108210_b", "ARL2108211_b", "ARL2108212_b", "ARL2108213_b", "ARL2108214_b", "ARL2108215_b", "ARL2108216_b", "ARL2108217_b", "ARL2108218_b", "ARL2108219_b", "REFTTO_IS_211201_3", "REFTTO_IS_211203_1", "ARL2108220", "ARL2108221", "ARL2108222", "ARL2108223", "ARL2108224", "ARL2108225", "ARL2108226", "ARL2108227", "ARL2108228", "ARL2108229", "REFTTO_IS_211203_2", "ARL2108230", "ARL2108231", "ARL2108232", "ARL2108233", "ARL2108234", "ARL2108235", "ARL2108236", "ARL2108237", "ARL2108238", "ARL2108239", "REFTTO_IS_211203_3", "REFTTO_IS_211206_1", "ARL2108240", "ARL2108241", "ARL2108242", "ARL2108243", "ARL2108244", "ARL2108245", "ARL2108246", "ARL2108247", "ARL2108248", "ARL2108249", "REFTTO_IS_211206_2", "ARL2108250", "ARL2108251", "ARL2108252", "ARL2108253", 
"ARL2108254", "ARL2108255", "ARL2108256", "ARL2108258", "ARL2108259", "REFTTO_IS_211206_3" ), 
response_coefficient = c("1.09785865302384", "NA", "NA", "NA", "NA", "NA", 
"NA", "NA", "NA", "NA", "NA", "1.09822862814289", "NA", "NA", "NA", "NA", 
"NA", "NA", "NA", "NA", "NA", "NA", "1.09835314677401", "1.09942926690693", 
"NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", 
"1.10084276861106", "NA", "NA", "NA", "NA", 
"NA", "NA", "NA", "NA", "NA", "NA", "1.10078178211056", "1.11104600880183", 
"NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "1.11203467893562", 
"NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "1.11344223852824" )), 
class = "data.frame", row.names = c(NA, -68L))
names(dd) <- c("id", "coef")
  • Related