I want to use R to identify when a criterion is met for the first time and ignore subsequent changes. Example data:
# Example input: a single numeric column of 0/1 responses.
df <- data.frame(
  response = c(1, 1, 1, 0, 1, 0)
)
Note: first response always starts with 1.
Expected output
# Desired result: only the row holding the first 0 is flagged "yes".
f <- data.frame(
  response = c(1, 1, 1, 0, 1, 0),
  Threshold = c("no", "no", "no", "yes", "no", "no")
)
CodePudding user response:
Set all to "no", then find the first 0, and set that one to "yes":
# Flag every row "no" first, then flip only the first row whose response is 0.
# which() lists all positions of 0; [1] keeps the earliest one.
df[["Threshold"]] <- "no"
df[["Threshold"]][which(df[["response"]] == 0)[1]] <- "yes"
# df
# response Threshold
# 1 1 no
# 2 1 no
# 3 1 no
# 4 0 yes
# 5 1 no
# 6 0 no
CodePudding user response:
Using @zx8754's advice, with data.table:
library(data.table)
library(magrittr)

# Example data, including the expected Threshold column for comparison.
df <- data.frame(
  response = c(1, 1, 1, 0, 1, 0),
  Threshold = c("no", "no", "no", "yes", "no", "no")
)

# Convert to a data.table, default the new column to "no", then, within the
# rows where response is 0, mark only the first such row as "yes".
setDT(df)
df[, Threshold_new := "no"] %>%
  .[
    response == 0,
    Threshold_new := fifelse(seq_len(.N) == 1L, "yes", Threshold_new)
  ] %>%
  .[]
#> response Threshold Threshold_new
#> 1: 1 no no
#> 2: 1 no no
#> 3: 1 no no
#> 4: 0 yes yes
#> 5: 1 no no
#> 6: 0 no no
Created on 2023-01-09 with reprex v2.0.2
CodePudding user response:
You can use match() to get the first 0.
# Default all rows to "no"; match() returns the position of the first 0 only,
# so just that row is switched to "yes".
df[["Threshold"]] <- "no"
df[["Threshold"]][match(0, df[["response"]])] <- "yes"
df
# response Threshold
#1 1 no
#2 1 no
#3 1 no
#4 0 yes
#5 1 no
#6 0 no
To speed it up and reduce memory consumption (the approach is basically the same):
# Build the whole column as a plain vector first (all "no", with the position
# of the first 0 set to "yes"), then assign it to the data frame once.
df$Threshold <- `[<-`(
  rep_len("no", nrow(df)),
  match(0, df$response),
  "yes"
)
# Alternative using replace():
# df$Threshold <- replace(rep("no", nrow(df)), match(0, df$response), "yes")
Just for fun, a benchmark:
# Benchmark of the four approaches on a large random input.
# Fixed seed so the sampled responses are the same on every run.
set.seed(42)
n <- 1e6
# n + 1 rows: a leading 1 followed by n random draws from 0/1.
DF <- data.frame(response = c(1, sample(0:1, n, TRUE)))
library(data.table) # Needed by the "Yuriy Saraykin" expression
library(magrittr) # Needed by the "Yuriy Saraykin" expression
# check = FALSE turns off bench::mark()'s comparison of the expressions'
# results against each other.
# Every expression rebinds df to DF before adding the Threshold column.
bench::mark(check = FALSE,
# which()-based approach: find all zeros, keep the first position.
zx8754 = {df <- DF; df$Threshold <- "no"
df$Threshold[ which(df$response == 0)[ 1 ] ] <- "yes"}
# data.table approach.
# NOTE(review): setDT() converts by reference and df was just bound to DF
# without an explicit copy -- confirm this does not alter DF for the
# expressions that run afterwards.
, "Yuriy Saraykin" = {df <- DF; setDT(df)[, Threshold := "no"] %>%
.[response == 0, Threshold := fifelse(cumsum(response == 0) == 1, "yes", Threshold)]}
# match()-based approach: position of the first 0 only.
, GKi = {df <- DF; df$Threshold <- "no"
df$Threshold[match(0, df$response)] <- "yes"}
# Same as GKi, but the column is built as a vector and assigned once.
, GKi2 = {df <- DF
df$Threshold <- `[<-`(rep("no", nrow(df)), match(0, df$response), "yes")}
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 zx8754 26.9ms 27ms 37.1 47.7MB 315. 2 17
#2 Yuriy Saraykin 60ms 62ms 16.1 38.3MB 64.5 2 8
#3 GKi 21.8ms 21.9ms 45.7 45.8MB 320. 3 21
#4 GKi2 13.8ms 13.8ms 72.2 30.5MB 91.5 15 19
In this case, GKi2 is the fastest method and uses the least memory of the methods compared.
Doing it per group (as requested in the comment).
# Two groups ("a" and "b") of six responses each.
df <- data.frame(
  response = c(1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0),
  spec = rep(c("a", "b"), each = 6)
)

# ave() applies the function to the responses of each spec group separately:
# start from all "no" and mark that group's first 0 as "yes".
df$Threshold <- ave(df$response, df$spec, FUN = function(x) {
  flags <- rep("no", length(x))
  flags[match(0, x)] <- "yes"
  flags
})
df
# response spec Threshold
#1 1 a no
#2 1 a no
#3 1 a no
#4 0 a yes
#5 1 a no
#6 0 a no
#7 1 b no
#8 0 b yes
#9 0 b no
#10 1 b no
#11 0 b no
#12 0 b no