Create a binary variable that also depends on the previous year's value in R - CodePudding

In my dataset, I have the following variables:

gid = cell identifier
Year
Battles: count of battles per year
IncidenceBattles: 1 if at least one battle happened that year in that cell, 0 otherwise. To construct the incidence variable, I used the following code: test$IncidenceBattles <- ifelse(test$Battles >= 1, c(1), c(0))

I would like to create a binary variable OnsetBattle that equals 1 if we observe at least 1 battle in a particular year and none in the preceding year.

Example for cell 115593, year 2001: OnsetBattle will be equal to 1 because IncidenceBattles = 1 in 2001 and there was no battle in 2000.

Note: It's OK if there are missing values. Especially before 1997.

subset of my dataset:

# Sample data as a structure() literal: a grouped tibble (class "grouped_df",
# grouped by gid) with 15 rows for cell 115593, one row per Year 1996-2010.
# Battles and IncidenceBattles are NA for 1996.
structure(list(gid = c(115593, 115593, 115593, 115593, 115593, 
115593, 115593, 115593, 115593, 115593, 115593, 115593, 115593, 
115593, 115593), Year = c(1996, 1997, 1998, 1999, 2000, 2001, 
2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010), Battles = c(NA, 
7, 9, 291, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0), IncidenceBattles = c(NA, 
1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0)), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -15L), groups = structure(list(
    gid = 115593, .rows = structure(list(1:15), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -1L), .drop = TRUE))

CodePudding user response：

# Flag battle onsets with base R: OnsetBattle is 1 for a year that has at
# least one battle and directly follows a no-battle year of the same cell,
# and 0 otherwise (including rows whose values are NA).
# Assumes rows are sorted by gid and then by Year.
df <- data.frame(gid = c(115593, 115593, 115593, 115593, 115593, 115593, 115593,
                         115593, 115593, 115593, 115593, 115593, 115593, 115593, 115593),
                 Year = c(1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
                          2005, 2006, 2007, 2008, 2009, 2010),
                 Battles = c(NA, 7, 9, 291, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                 IncidenceBattles = c(NA, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0))

# find years with no battles (which() silently drops rows where the test is NA)
idx <- which((df$Battles == 0) & (df$IncidenceBattles == 0))
# ignore the last row: there is no following year to inspect
idx <- setdiff(idx, nrow(df))
# move the index forward 1 year
idx <- idx + 1
# keep only rows whose predecessor belongs to the same cell, so an onset is
# never derived from the last year of a different gid
idx <- idx[which(df$gid[idx] == df$gid[idx - 1])]
# check the year after no-battle years to see if there are any battles
idx2 <- ((df[idx, c("Battles", "IncidenceBattles")] |> rowSums()) > 0) |> which()
# retain years that have battles
idx <- idx[idx2]
# initialize the variable to 0
df$OnsetBattle <- 0
# set to 1 for years with battles that follow a no-battle year
df[idx, "OnsetBattle"] <- 1
print(df)

CodePudding user response：

I solved it by applying a one-year lag to my incidence variable within each cell:

library(dplyr)

# Onset = incidence this year and no incidence in the previous row of the
# same cell. dplyr::lag() is used because Lag() is neither a base R nor a
# dplyr function. The first year of each gid has an NA lag, so OnsetBattle is
# NA there when IncidenceBattles is 1 (or NA) and 0 when it is 0.
df %>%
  ungroup() %>%
  arrange(gid, Year) %>%
  group_by(gid) %>%
  mutate(lag_battles = dplyr::lag(IncidenceBattles, n = 1),
         OnsetBattle = ifelse(IncidenceBattles == 1 & lag_battles == 0, 1, 0)) %>%
  ungroup()

CodePudding user response：

Take the year-to-year difference of the IncidenceBattles values within each cell; OnsetBattle is 1 where that difference equals 1 (a change from 0 to 1) and 0 otherwise. Note that IncidenceBattles could be defined as sign(Battles). The question did not specify how to handle the first two rows, so we have used NA.

library(dplyr)

# `d` is presumably the question's data frame (gid, Year, Battles,
# IncidenceBattles) -- confirm before running.
# diff() == 1 marks a 0 -> 1 change in IncidenceBattles; the unary `+` turns
# the logical result into integer 0/1, and the leading NA pads the first year
# of each cell, which has no predecessor.
d %>%
  arrange(gid, Year) %>%
  group_by(gid) %>%
  mutate(OnsetBattle = c(NA, +(diff(IncidenceBattles) == 1))) %>%
  ungroup()

giving:

# A tibble: 15 × 5
      gid  Year Battles IncidenceBattles OnsetBattle
    <dbl> <dbl>   <dbl>            <dbl>       <int>
 1 115593  1996      NA               NA          NA
 2 115593  1997       7                1          NA
 3 115593  1998       9                1           0
 4 115593  1999     291                1           0
 5 115593  2000       0                0           0
 6 115593  2001       5                1           1
 7 115593  2002       0                0           0
 8 115593  2003       0                0           0
 9 115593  2004       0                0           0
10 115593  2005       0                0           0
11 115593  2006       0                0           0
12 115593  2007       0                0           0
13 115593  2008       0                0           0
14 115593  2009       0                0           0
15 115593  2010       0                0           0