I need to calculate 2 week percent change for a dataset that may not have samples spaced exactly 14 days apart. This forloop gives me % change for days that are exactly 14 days apart, but can't handle the sampling frequency wobble. I.e. 2022-06-14 % change is NA because there was no sample 2022-05-31 but there is one 2022-05-30. I would like a % change based either on the value of 2022-05-30 or an imputation of 2022-05-31 based on 2022-05-30 and 2022-06-02.
library(dplyr)
library(tidyr)
library(lubridate)
dat.N1 <- structure(list(date = c("2022-04-27", "2022-04-29", "2022-05-02",
"2022-05-04", "2022-05-06", "2022-05-17", "2022-05-19", "2022-05-24",
"2022-05-26", "2022-05-30", "2022-06-02", "2022-06-07", "2022-06-09",
"2022-06-14", "2022-06-17", "2022-06-21", "2022-06-28", "2022-06-30",
"2022-07-05", "2022-07-07", "2022-07-12"), copies_liter = c(168649.864,
62449.256, 464682.88, 127620.624, 2110.27168, 20384.6968, 6817.724,
145.2679712, 0.3792992, 51.4470568, 0.01, 30094.404, 42225.784,
37688.632, 30730.0368, 8108.9016, 6142.6856, 7411.6464, 77131.912,
23668.7056, 11973.198)), row.names = 210:230, class = "data.frame")
dat.N1$date <- as.Date(dat.N1$date)
dat.N1$date_min2 <- dat.N1$date-14
dat.N1$prop <-1:21
for (i in 1:21){
copies_d_current <- dat.N1[i, "copies_liter"]
copies_d_past <- dat.N1[dat.N1[, "date"]==dat.N1[i, "date_min2"],
"copies_liter"]
dat.N1$prop[i] <- ifelse(length(copies_d_current/copies_d_past)==0,
NA,
copies_d_current/copies_d_past %>% as.numeric())
dat.N1$perc <- 100-dat.N1$prop*100
#print(i)
}
CodePudding user response:
I'm not sure what type of imputation you might want, but here is simple linear interpoloation that gives you the percent change 14 days prior.
dates = seq(min(dat.N1$date), max(dat.N1$date), by="day")
dat.N1 %>%
left_join(
data.frame(
date=dates, imp_14d_prior = approxfun(dat.N1$date,dat.N1$copies_liter)(dates)
), by=c("date_min2"="date")
) %>%
mutate(perc_ch = 100-(copies_liter/imp_14d_prior)*100)
Output:
date copies_liter date_min2 imp_14d_prior perc_ch
1 2022-04-27 1.686499e 05 2022-04-13 NA NA
2 2022-04-29 6.244926e 04 2022-04-15 NA NA
3 2022-05-02 4.646829e 05 2022-04-18 NA NA
4 2022-05-04 1.276206e 05 2022-04-20 NA NA
5 2022-05-06 2.110272e 03 2022-04-22 NA NA
6 2022-05-17 2.038470e 04 2022-05-03 2.961518e 05 9.311681e 01
7 2022-05-19 6.817724e 03 2022-05-05 6.486545e 04 8.948944e 01
8 2022-05-24 1.452680e 02 2022-05-10 8.755517e 03 9.834084e 01
9 2022-05-26 3.792992e-01 2022-05-12 1.207814e 04 9.999686e 01
10 2022-05-30 5.144706e 01 2022-05-16 1.872339e 04 9.972523e 01
11 2022-06-02 1.000000e-02 2022-05-19 6.817724e 03 9.999985e 01
12 2022-06-07 3.009440e 04 2022-05-24 1.452680e 02 -2.061648e 04
13 2022-06-09 4.222578e 04 2022-05-26 3.792992e-01 -1.113248e 07
14 2022-06-14 3.768863e 04 2022-05-31 3.430137e 01 -1.097750e 05
15 2022-06-17 3.073004e 04 2022-06-03 6.018889e 03 -4.105600e 02
16 2022-06-21 8.108902e 03 2022-06-07 3.009440e 04 7.305512e 01
17 2022-06-28 6.142686e 03 2022-06-14 3.768863e 04 8.370149e 01
18 2022-06-30 7.411646e 03 2022-06-16 3.304957e 04 7.757415e 01
19 2022-07-05 7.713191e 04 2022-06-21 8.108902e 03 -8.512005e 02
20 2022-07-07 2.366871e 04 2022-06-23 7.547126e 03 -2.136122e 02
21 2022-07-12 1.197320e 04 2022-06-28 6.142686e 03 -9.491797e 01
CodePudding user response:
Convert dat.N1
to a zoo series z
and merge it with all days. Then use na.approx
to fill in the NA days with interpolated values and finally use diff.zoo
with arith=FALSE
so that it takes ratios rather than differences. This gives a zoo series zz
and we subset it to just get the ratios associated with the original data. Use fortify.zoo(ratios)
if you need a data frame.
library(zoo)
z <- read.zoo(dat.N1)
m <- na.approx(merge(z, zoo(, seq(start(z), end(z), "day"))))
zz <- diff(m, 14, arith = FALSE, na.pad = TRUE)
ratios <- zz[time(z)]
Update
Added ratios which gives just the times in dat.N1.