Home > database >  Avoid to subset the data and then merge to make calculations in dplyr
Avoid to subset the data and then merge to make calculations in dplyr

Time:01-21

I have a data set like df:

df <- data.frame(year = c("2000", "2000", "2000", "2002", "2007", "2001", "2002", "2004", "2007"), 
                 id = c("X", "X", "X", "X", "X", "Z", "Z", "Z", "Z"), 
                 product = c("apple","orange", "melon", "orange", "orange", "orange", "cake", "bacon", "truffels"), 
                 market = c("CHN", "USA", "USA", "CAN", "USA", "USA", "CHL", "CHL", "ECU"), 
                 value = c(1, 2, 3, 4, 5, 6, 7, 8, 9))

All I want to do is to create the variable years_id_consecutive which is a variable that counts the number of years that a given id shows up in the data in a consecutive way. Consecutive here is defined as showing up during the last 2 years before appearing in year t. My trial goes below:

df_panel <- df %>%
  mutate(year = as.numeric(year)) %>%
  group_by(id, year) %>%
  summarise(value=head(value,1)) %>% #take the first row by id-year
  ungroup() %>%
  complete(nesting(id), year = full_seq(year, period = 1)) #create a balanced panel


df_panel <- df_panel %>%
  group_by(id) %>%
  mutate(value_lag1 = lag(value, 1),
         value_lag2 = lag(value, 2),
         continuous = ifelse(!is.na(value) & !is.na(value_lag1) | !is.na(value) & !is.na(value_lag2), 1, 0)) %>%
  ungroup() %>%
  drop_na(value) %>%
  mutate(continuous_lag1 = lag(continuous, 1),
         years_id_consecutive = ifelse(continuous==0, 0, continuous continuous_lag1),
         year = as.character(year)) %>%
  select(-value_lag1, -value_lag2, -continuous, -continuous_lag1) 
  

df_new <- left_join(df, df_panel, by = c("id", "year", "value"))
df_new[is.na(df_new)] <- 0  

My trial in dplyr cuts the data and then merges the data with the original dataset. However, this method is very slow as my current data set is huge. Therefore I am looking for a solution in R base or a different method that is faster and integrated in one chunk of code.

CodePudding user response:

We don't need to use lag or complete, we can do it with base:rleand tidyr::map2. rle is to identify consecutive values and map2 to create sequences of consecutives years.

    suppressWarnings(library(tidyverse))
    
    df <- data.frame(year = c("2000", "2000", "2000", "2002", "2007", "2001", "2002", "2004", "2007"), 
                     id = c("X", "X", "X", "X", "X", "Z", "Z", "Z", "Z"), 
                     product = c("apple","orange", "melon", "orange", "orange", "orange", "cake", "bacon", "truffels"), 
                     market = c("CHN", "USA", "USA", "CAN", "USA", "USA", "CHL", "CHL", "ECU"), 
                     value = c(1, 2, 3, 4, 5, 6, 7, 8, 9))
    
    df_teste <- 
    df |> 
      arrange(id,year) |>
      group_by(id) |> 
      mutate(year = as.integer(year) ,
             id_consecutive = year %in% c(year   1, year   2),
             year = as.character(year))
    
    rle_year <- df_teste$id_consecutive |> rle() 
    #> Run Length Encoding
    #> lengths: int [1:5] 3 1 2 2 1
    #> values : logi [1:5] FALSE TRUE FALSE TRUE FALSE

    years_consecutive <- unlist( map2(rle_year$values, rle_year$lengths, ~ if(.x) seq(1,.y) else rep(0,.y)) )
    
    df_teste$years_consecutive <- years_consecutive
    df_teste
    #> # A tibble: 9 × 7
    #> # Groups:   id [2]
    #>   year  id    product  market value id_consecutive years_consecutive
    #>   <chr> <chr> <chr>    <chr>  <dbl> <lgl>                      <dbl>
    #> 1 2000  X     apple    CHN        1 FALSE                          0
    #> 2 2000  X     orange   USA        2 FALSE                          0
    #> 3 2000  X     melon    USA        3 FALSE                          0
    #> 4 2002  X     orange   CAN        4 TRUE                           1
    #> 5 2007  X     orange   USA        5 FALSE                          0
    #> 6 2001  Z     orange   USA        6 FALSE                          0
    #> 7 2002  Z     cake     CHL        7 TRUE                           1
    #> 8 2004  Z     bacon    CHL        8 TRUE                           2
    #> 9 2007  Z     truffels ECU        9 FALSE                          0

    Created on 2023-01-20 with reprex v2.0.2

years_consecutive isn't grouped, so is best to order the df before

EDIT

We can write everything in one chunk of code using nest

df |> 
  group_by(id) |> 
  mutate(year = as.integer(year) ,
         id_consecutive = year %in% c(year   1, year   2),
         year = as.character(year)) |> 
  nest() |> 
  mutate(rle_year = map(data, ~rle(.x$id_consecutive) ), 
         years_consecutive =
           map(rle_year, ~ unlist(map2(.x$values, .x$lengths, ~ if(.x) seq(1,.y) else rep(0,.y)) ) )
         ) |> 
  unnest(c(data, years_consecutive))
#> # A tibble: 9 × 8
#> # Groups:   id [2]
#>   id    year  product  market value id_consecutive rle_year years_consecutive
#>   <chr> <chr> <chr>    <chr>  <dbl> <lgl>          <list>               <dbl>
#> 1 X     2000  apple    CHN        1 FALSE          <rle>                    0
#> 2 X     2000  orange   USA        2 FALSE          <rle>                    0
#> 3 X     2000  melon    USA        3 FALSE          <rle>                    0
#> 4 X     2002  orange   CAN        4 TRUE           <rle>                    1
#> 5 X     2007  orange   USA        5 FALSE          <rle>                    0
#> 6 Z     2001  orange   USA        6 FALSE          <rle>                    0
#> 7 Z     2002  cake     CHL        7 TRUE           <rle>                    1
#> 8 Z     2004  bacon    CHL        8 TRUE           <rle>                    2
#> 9 Z     2007  truffels ECU        9 FALSE          <rle>                    0
  • Related