I'm looking for instances in my dataset where 4 of the last 6 samples show progressively increasing concentrations. I've worked out the logic of a series of if statements but I'm having trouble applying it to my data. I was planning on using cbind to attach the list to my dataset but the list ends up with 28 values and my data only has 24 rows.
I can't figure out what's happening with the for loop and have read that it's not a great way to do things anyways so I'm looking for alternatives.
EDIT: I've added some photos of two specific examples where suggested answers fail. The trouble seems to be that the "simple" solutions look for increases between consecutive points only. I'm looking for four points over each set of six that increase.
Here is some of the data:
SAMPLE_DATE <- c("2013-08-02", "2014-06-13", "2015-09-03", "2016-06-12", "2016-09-27", "2017-05-30", "2017-05-30", "2017-09-14", "2017-09-14", "2017-12-02", "2018-03-29", "2018-06-05", "2018-10-19", "2019-02-27", "2019-06-04", "2019-08-28", "2019-10-22", "2020-02-04", "2020-06-06", "2020-08-26", "2020-10-23", "2021-02-01", "2021-06-15", "2021-08-03")
REPORT_RESULT_VALUE <- c(0.1470, 0.0623, 1.4600, 0.1810, 0.0509, 0.0801, 0.0801, 0.0999, 0.0980, 0.0820, 0.0698, 0.0884, 0.1060, 0.1010, 0.0984, 0.1050, 0.1100, 0.0980, 0.1000, 0.1090, 0.1050, 0.0662, 0.0944, 0.1220)
GWSubsetData <- data.frame(SAMPLE_DATE, REPORT_RESULT_VALUE)
And here is what I've attempted:
Groundwater_ST1 <- vector("list")
for (i in seq_along(GWSubsetData$REPORT_RESULT_VALUE)) {
if (i >= 6) {
a <- i-5
b <- i-4
c <- i-3
d <- i-2
e <- i-1
#If i > 3 of first 5 samples
if (sum(GWSubsetData$REPORT_RESULT_VALUE[[i]] > GWSubsetData$REPORT_RESULT_VALUE[a:e]) >= 3) {
#If i>E and E > 2 of first 4 samples
if ((GWSubsetData$REPORT_RESULT_VALUE[[i]] > GWSubsetData$REPORT_RESULT_VALUE[[e]]) &
(sum(GWSubsetData$REPORT_RESULT_VALUE[[e]] > GWSubsetData$REPORT_RESULT_VALUE[a:d]) > 2)) {
#if E>D and D > 1 of first 3 samples
if ((GWSubsetData$REPORT_RESULT_VALUE[[e]] > GWSubsetData$REPORT_RESULT_VALUE[[d]]) &
(sum(GWSubsetData$REPORT_RESULT_VALUE[[d]] > GWSubsetData$REPORT_RESULT_VALUE[a:c]) >= 1)) {
Groundwater_ST1[i] = TRUE
#If E>C and C > 1 of first 2 sampels
}else if ((GWSubsetData$REPORT_RESULT_VALUE[[e]] > GWSubsetData$REPORT_RESULT_VALUE[[c]]) &
(sum(GWSubsetData$REPORT_RESULT_VALUE[[c]] > GWSubsetData$REPORT_RESULT_VALUE[a:b]) >= 1)) {
Groundwater_ST1[i] = TRUE
#If E>B and B>A
}else if ((GWSubsetData$REPORT_RESULT_VALUE[[e]] > GWSubsetData$REPORT_RESULT_VALUE[[b]]) &
(GWSubsetData$REPORT_RESULT_VALUE[[b]] > GWSubsetData$REPORT_RESULT_VALUE[[a]])) {
Groundwater_ST1[i] = TRUE
}else{
Groundwater_ST1[i] = FALSE
}
#If i>D and D > 2 of first 3 samples
}else if ((GWSubsetData$REPORT_RESULT_VALUE[[i]] > GWSubsetData$REPORT_RESULT_VALUE[[d]]) &
(sum(GWSubsetData$REPORT_RESULT_VALUE[[d]] > GWSubsetData$REPORT_RESULT_VALUE[a:c]) >= 2)) {
#If D>C and C > 1 of first 2 samples
if ((GWSubsetData$REPORT_RESULT_VALUE[[d]] > GWSubsetData$REPORT_RESULT_VALUE[[c]]) &
(sum(GWSubsetData$REPORT_RESULT_VALUE[[c]] > GWSubsetData$REPORT_RESULT_VALUE[a:b]) >= 1)) {
Groundwater_ST1[i] = TRUE
#If D>B and B>A
}else if ((GWSubsetData$REPORT_RESULT_VALUE[[d]] > GWSubsetData$REPORT_RESULT_VALUE[[b]]) &
(GWSubsetData$REPORT_RESULT_VALUE[[b]] > GWSubsetData$REPORT_RESULT_VALUE[[a]])) {
Groundwater_ST1[i] = TRUE
}else{
Groundwater_ST1[i] = FALSE
}
#If i > c > b > a
}else if ((GWSubsetData$REPORT_RESULT_VALUE[[i]] > GWSubsetData$REPORT_RESULT_VALUE[[c]]) &
(GWSubsetData$REPORT_RESULT_VALUE[[c]] > GWSubsetData$REPORT_RESULT_VALUE[b]) &
(GWSubsetData$REPORT_RESULT_VALUE[[b]] > GWSubsetData$REPORT_RESULT_VALUE[[a]])) {
Groundwater_ST1[i] = TRUE
}else{
Groundwater_ST1[i] = FALSE
}
}else{
Groundwater_ST1[i] = FALSE }
}else{
Groundwater_ST1[i] = FALSE
}
}
CodePudding user response:
Would this work? I'm assuming that if the change between a value and its previous value is greater than 0, there was an increase and if this happens 4 or more times within six consecutive values, your condition Groundwater_ST1 == TRUE:
library(dplyr)
library(RcppRoll)
dat %>%
mutate(change = dat$val - lag(dat$val)) %>%
mutate(incr = change > 0) %>%
mutate(roll_sum = roll_sum(incr, 6, align = "right", fill = NA)) %>%
mutate(Groundwater_ST1 = roll_sum >= 4)
result:
date val change incr roll_sum Groundwater_ST1
1 2013-08-02 0.1470 NA NA NA NA
2 2014-06-13 0.0623 -0.0847 FALSE NA NA
3 2015-09-03 1.4600 1.3977 TRUE NA NA
4 2016-06-12 0.1810 -1.2790 FALSE NA NA
5 2016-09-27 0.0509 -0.1301 FALSE NA NA
6 2017-05-30 0.0801 0.0292 TRUE NA NA
7 2017-05-30 0.0801 0.0000 FALSE 2 FALSE
8 2017-09-14 0.0999 0.0198 TRUE 3 FALSE
9 2017-09-14 0.0980 -0.0019 FALSE 2 FALSE
10 2017-12-02 0.0820 -0.0160 FALSE 2 FALSE
11 2018-03-29 0.0698 -0.0122 FALSE 2 FALSE
12 2018-06-05 0.0884 0.0186 TRUE 2 FALSE
13 2018-10-19 0.1060 0.0176 TRUE 3 FALSE
14 2019-02-27 0.1010 -0.0050 FALSE 2 FALSE
15 2019-06-04 0.0984 -0.0026 FALSE 2 FALSE
16 2019-08-28 0.1050 0.0066 TRUE 3 FALSE
17 2019-10-22 0.1100 0.0050 TRUE 4 TRUE
18 2020-02-04 0.0980 -0.0120 FALSE 3 FALSE
19 2020-06-06 0.1000 0.0020 TRUE 3 FALSE
20 2020-08-26 0.1090 0.0090 TRUE 4 TRUE
21 2020-10-23 0.1050 -0.0040 FALSE 4 TRUE
22 2021-02-01 0.0662 -0.0388 FALSE 3 FALSE
23 2021-06-15 0.0944 0.0282 TRUE 3 FALSE
24 2021-08-03 0.1220 0.0276 TRUE 4 TRUE
data:
dat <- data.frame(
date =c("2013-08-02", "2014-06-13", "2015-09-03", "2016-06-12", "2016-09-27", "2017-05-30", "2017-05-30", "2017-09-14", "2017-09-14", "2017-12-02", "2018-03-29", "2018-06-05", "2018-10-19", "2019-02-27", "2019-06-04", "2019-08-28", "2019-10-22", "2020-02-04", "2020-06-06", "2020-08-26", "2020-10-23", "2021-02-01", "2021-06-15", "2021-08-03"),
val = c(0.1470, 0.0623, 1.4600, 0.1810, 0.0509, 0.0801, 0.0801, 0.0999, 0.0980, 0.0820, 0.0698, 0.0884, 0.1060, 0.1010, 0.0984, 0.1050, 0.1100, 0.0980, 0.1000, 0.1090, 0.1050, 0.0662, 0.0944, 0.1220))
CodePudding user response:
I think we could do this by counting the number of cumulative increases, then looking to see how the increases have changed over a window of 6 values.
library(dplyr)
GWSubsetData %>%
mutate(increases = cumsum(REPORT_RESULT_VALUE > lag(REPORT_RESULT_VALUE, default = 0)),
n_incr_last_6 = increases - lag(increases, 6, default = 0),
flag = n_incr_last_6 >= 4)
SAMPLE_DATE REPORT_RESULT_VALUE increases n_incr_last_6 flag
1 2013-08-02 0.1470 1 1 FALSE
2 2014-06-13 0.0623 1 1 FALSE
3 2015-09-03 1.4600 2 2 FALSE
4 2016-06-12 0.1810 2 2 FALSE
5 2016-09-27 0.0509 2 2 FALSE
6 2017-05-30 0.0801 3 3 FALSE
7 2017-05-30 0.0801 3 2 FALSE
8 2017-09-14 0.0999 4 3 FALSE
9 2017-09-14 0.0980 4 2 FALSE
10 2017-12-02 0.0820 4 2 FALSE
11 2018-03-29 0.0698 4 2 FALSE
12 2018-06-05 0.0884 5 2 FALSE
13 2018-10-19 0.1060 6 3 FALSE
14 2019-02-27 0.1010 6 2 FALSE
15 2019-06-04 0.0984 6 2 FALSE
16 2019-08-28 0.1050 7 3 FALSE
17 2019-10-22 0.1100 8 4 TRUE
18 2020-02-04 0.0980 8 3 FALSE
19 2020-06-06 0.1000 9 3 FALSE
20 2020-08-26 0.1090 10 4 TRUE
21 2020-10-23 0.1050 10 4 TRUE
22 2021-02-01 0.0662 10 3 FALSE
23 2021-06-15 0.0944 11 3 FALSE
24 2021-08-03 0.1220 12 4 TRUE
Base R equivalent:
GWSubsetData$incr = cumsum(c(1, diff(GWSubsetData$REPORT_RESULT_VALUE) > 0))
GWSubsetData$flag = (GWSubsetData$incr - lag(GWSubsetData$incr, 6, default = 0)) >= 4