I have a data set like df:
df <- data.frame(year = c("2000", "2000", "2000", "2002", "2007", "2001", "2002", "2004", "2007"),
id = c("X", "X", "X", "X", "X", "Z", "Z", "Z", "Z"),
product = c("apple","orange", "melon", "orange", "orange", "orange", "cake", "bacon", "truffels"),
market = c("CHN", "USA", "USA", "CAN", "USA", "USA", "CHL", "CHL", "ECU"),
value = c(1, 2, 3, 4, 5, 6, 7, 8, 9))
All I want to do is to create the variable years_id_consecutive which is a variable that counts the number of years that a given id shows up in the data in a consecutive way. Consecutive here is defined as showing up during the last 2 years before appearing in year t. My trial goes below:
df_panel <- df %>%
mutate(year = as.numeric(year)) %>%
group_by(id, year) %>%
summarise(value=head(value,1)) %>% #take the first row by id-year
ungroup() %>%
complete(nesting(id), year = full_seq(year, period = 1)) #create a balanced panel
df_panel <- df_panel %>%
group_by(id) %>%
mutate(value_lag1 = lag(value, 1),
value_lag2 = lag(value, 2),
continuous = ifelse(!is.na(value) & !is.na(value_lag1) | !is.na(value) & !is.na(value_lag2), 1, 0)) %>%
ungroup() %>%
drop_na(value) %>%
mutate(continuous_lag1 = lag(continuous, 1),
years_id_consecutive = ifelse(continuous==0, 0, continuous continuous_lag1),
year = as.character(year)) %>%
select(-value_lag1, -value_lag2, -continuous, -continuous_lag1)
df_new <- left_join(df, df_panel, by = c("id", "year", "value"))
df_new[is.na(df_new)] <- 0
My trial in dplyr cuts the data and then merges the data with the original dataset. However, this method is very slow as my current data set is huge. Therefore I am looking for a solution in R base or a different method that is faster and integrated in one chunk of code.
CodePudding user response:
We don't need to use lag or complete, we can do it with base:rle
and tidyr::map2
. rle
is to identify consecutive values and map2
to create sequences of consecutives years.
suppressWarnings(library(tidyverse))
df <- data.frame(year = c("2000", "2000", "2000", "2002", "2007", "2001", "2002", "2004", "2007"),
id = c("X", "X", "X", "X", "X", "Z", "Z", "Z", "Z"),
product = c("apple","orange", "melon", "orange", "orange", "orange", "cake", "bacon", "truffels"),
market = c("CHN", "USA", "USA", "CAN", "USA", "USA", "CHL", "CHL", "ECU"),
value = c(1, 2, 3, 4, 5, 6, 7, 8, 9))
df_teste <-
df |>
arrange(id,year) |>
group_by(id) |>
mutate(year = as.integer(year) ,
id_consecutive = year %in% c(year 1, year 2),
year = as.character(year))
rle_year <- df_teste$id_consecutive |> rle()
#> Run Length Encoding
#> lengths: int [1:5] 3 1 2 2 1
#> values : logi [1:5] FALSE TRUE FALSE TRUE FALSE
years_consecutive <- unlist( map2(rle_year$values, rle_year$lengths, ~ if(.x) seq(1,.y) else rep(0,.y)) )
df_teste$years_consecutive <- years_consecutive
df_teste
#> # A tibble: 9 × 7
#> # Groups: id [2]
#> year id product market value id_consecutive years_consecutive
#> <chr> <chr> <chr> <chr> <dbl> <lgl> <dbl>
#> 1 2000 X apple CHN 1 FALSE 0
#> 2 2000 X orange USA 2 FALSE 0
#> 3 2000 X melon USA 3 FALSE 0
#> 4 2002 X orange CAN 4 TRUE 1
#> 5 2007 X orange USA 5 FALSE 0
#> 6 2001 Z orange USA 6 FALSE 0
#> 7 2002 Z cake CHL 7 TRUE 1
#> 8 2004 Z bacon CHL 8 TRUE 2
#> 9 2007 Z truffels ECU 9 FALSE 0
Created on 2023-01-20 with reprex v2.0.2
years_consecutive
isn't grouped, so is best to order the df before
EDIT
We can write everything in one chunk of code using nest
df |>
group_by(id) |>
mutate(year = as.integer(year) ,
id_consecutive = year %in% c(year 1, year 2),
year = as.character(year)) |>
nest() |>
mutate(rle_year = map(data, ~rle(.x$id_consecutive) ),
years_consecutive =
map(rle_year, ~ unlist(map2(.x$values, .x$lengths, ~ if(.x) seq(1,.y) else rep(0,.y)) ) )
) |>
unnest(c(data, years_consecutive))
#> # A tibble: 9 × 8
#> # Groups: id [2]
#> id year product market value id_consecutive rle_year years_consecutive
#> <chr> <chr> <chr> <chr> <dbl> <lgl> <list> <dbl>
#> 1 X 2000 apple CHN 1 FALSE <rle> 0
#> 2 X 2000 orange USA 2 FALSE <rle> 0
#> 3 X 2000 melon USA 3 FALSE <rle> 0
#> 4 X 2002 orange CAN 4 TRUE <rle> 1
#> 5 X 2007 orange USA 5 FALSE <rle> 0
#> 6 Z 2001 orange USA 6 FALSE <rle> 0
#> 7 Z 2002 cake CHL 7 TRUE <rle> 1
#> 8 Z 2004 bacon CHL 8 TRUE <rle> 2
#> 9 Z 2007 truffels ECU 9 FALSE <rle> 0