Home > Software engineering >  Identify first change in value in a dataframe and ignore subsequent changes
Identify first change in value in a dataframe and ignore subsequent changes

Time:01-10

I want to use R to identify when a criterion is met for the first time and ignore subsequent changes. Example data:

# Example input: a single numeric `response` column holding 0/1 values.
df <- data.frame(
  response = c(1, 1, 1, 0, 1, 0)
)

Note: first response always starts with 1.

Expected output

# Desired result: only the first row whose `response` is 0 is flagged "yes".
f <- data.frame(
  response = c(1, 1, 1, 0, 1, 0),
  Threshold = c("no", "no", "no", "yes", "no", "no")
)

CodePudding user response:

Set all to "no", then find the first 0, and set that one to "yes":

# Default every row to "no", then flag only the first row whose response is 0.
df$Threshold <- "no"
first_zero <- which(df$response == 0)[1]
df$Threshold[first_zero] <- "yes"
# df
#   response Threshold
# 1        1        no
# 2        1        no
# 3        1        no
# 4        0       yes
# 5        1        no
# 6        0        no

CodePudding user response:

Using @zx8754's advice:

data.table

# Example data; `Threshold` holds the expected answer so it can be compared
# with the computed `Threshold_new` column.
df <-
  data.frame(
    response = c(1, 1, 1, 0, 1, 0),
    Threshold = c("no", "no", "no", "yes", "no", "no")
  )

library(data.table)
library(magrittr)
# Step 1: start with every row marked "no" in a new column.
setDT(df)[, Threshold_new := "no"] %>% 
  # Step 2: only rows with response == 0 are touched; within them the running
  # count of zeros equals 1 for the first such row alone, so only that row
  # becomes "yes" and the remaining zeros keep their current value.
  .[response == 0, Threshold_new := fifelse(cumsum(response == 0) == 1, "yes", Threshold_new)] %>% 
  # Step 3: trailing `[]` -- presumably there so the updated table is printed.
  .[]
#>    response Threshold Threshold_new
#> 1:        1        no            no
#> 2:        1        no            no
#> 3:        1        no            no
#> 4:        0       yes           yes
#> 5:        1        no            no
#> 6:        0        no            no

Created on 2023-01-09 with reprex v2.0.2

CodePudding user response:

You can use match to get the first 0.

# Every row starts as "no"; match() yields the position of the first 0 only,
# so exactly one row is switched to "yes".
df$Threshold <- "no"
hit <- match(0, df$response)
df$Threshold[hit] <- "yes"

df
#  response Threshold
#1        1        no
#2        1        no
#3        1        no
#4        0       yes
#5        1        no
#6        0        no

To speed it up and reduce memory consumption, the column can be built in a single step; the idea is basically the same:

# Build the whole column in one expression: a vector of "no" (one per row)
# whose element at the position of the first 0 is overwritten with "yes".
# `[<-` is called as a function here so no intermediate column is assigned.
df$Threshold <- `[<-`(rep("no", nrow(df)), match(0, df$response), "yes")
#df$Threshold <- replace(rep("no", nrow(df)), match(0, df$response), "yes") #Alternative using replace

Just for fun, a benchmark:

# Reproducible test data: a leading 1 followed by n random 0/1 responses.
set.seed(42)
n <- 1e6
DF <- data.frame(response = c(1, sample(0:1, n, TRUE)))

library(data.table) #For Yuriy Saraykin
library(magrittr)   #For Yuriy Saraykin

# Every expression starts from DF and builds the Threshold column its own way.
# check = FALSE turns off bench::mark's comparison of the expressions' results.
bench::mark(check = FALSE,
# Base R: set all to "no", then index the first 0 found via which().
zx8754 = {df <- DF; df$Threshold <- "no"
  df$Threshold[ which(df$response == 0)[ 1 ] ] <- "yes"}
# data.table: running count of zeros among the zero rows picks the first one.
# NOTE(review): setDT() modifies its argument by reference and `df <- DF` does
# not copy, so this may also change DF for the other expressions -- confirm.
, "Yuriy Saraykin" = {df <- DF; setDT(df)[, Threshold := "no"] %>% 
  .[response == 0, Threshold := fifelse(cumsum(response == 0) == 1, "yes", Threshold)]}
# Base R: set all to "no", then index the first 0 found via match().
, GKi = {df <- DF; df$Threshold <- "no"
  df$Threshold[match(0, df$response)] <- "yes"}
# Base R: build the finished vector first and assign the column once.
, GKi2 = {df <- DF
  df$Threshold <- `[<-`(rep("no", nrow(df)), match(0, df$response), "yes")}
)
#  expression          min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
#  <bch:expr>     <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
#1 zx8754           26.9ms     27ms      37.1    47.7MB    315.      2    17
#2 Yuriy Saraykin     60ms     62ms      16.1    38.3MB     64.5     2     8
#3 GKi              21.8ms   21.9ms      45.7    45.8MB    320.      3    21
#4 GKi2             13.8ms   13.8ms      72.2    30.5MB     91.5    15    19

In this case GKi2 is the fastest method and uses the least memory of the methods compared.


Doing it per group (as requested in the comment).

# Example data with two groups ("a" and "b") of six responses each.
df <- data.frame(
  response = c(1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0),
  spec = rep(c("a", "b"), each = 6)
)

# Within each `spec` group, mark the first 0 as "yes" and everything else
# as "no"; ave() applies the function per group and keeps the row order.
df$Threshold <- ave(
  df$response, df$spec,
  FUN = function(x) {
    flags <- rep("no", length(x))
    flags[match(0, x)] <- "yes"
    flags
  }
)

df
#   response spec Threshold
#1         1    a        no
#2         1    a        no
#3         1    a        no
#4         0    a       yes
#5         1    a        no
#6         0    a        no
#7         1    b        no
#8         0    b       yes
#9         0    b        no
#10        1    b        no
#11        0    b        no
#12        0    b        no
  •  Tags:  
  • r
  • Related