Home > Net >  Create conditional variables grouping by time and id
Create conditional variables grouping by time and id

Time:08-05

I have a data frame that contains the product and destination (dest_prod) consumed by individuals (id), observed during different years (year) in the data frame df:

id <- c("1", "1", "1", "1", "2", "2", "2", "2", "3", "3", "3", "3")
year <- c("2000", "2001", "2002", "2003", "2002", "2003", "2004", "2005", "2001", "2002", "2003", "2005")
dest_prod <- c("FRA_coke", "FRA_coke", "USA_coke", "FRA_coke USA_coke", 
               "FRA_coke", "SPA_ham", "FRA_coke SPA_ham", "GER_coke",
               "CHN_water", "CHN_oil", "CHN_water CHN_oil FRA_coke", "CHN_water CHN_oil FRA_coke")

df <- data.frame(id, year, dest_prod)

I would like to apply the following logic:

  1. Create a new variable "new_dest" if an individual consumes a prod_dest in year (t) that comes from a new destination (3 first letters of "dest_prod") than in year (t-1).
  2. Create a new variable "new_prod" if an individual consumes a prod_dest in year (t) that contains a new product (letters after the "_" symbol in "dest_prod") than in year (t-1).

Note!: If there is no observed year before (t) for a given individual or the last observation comes from 2 years or longer, I will assign "NA" value.

The new dataframe looks like df_mod:

new_dest <- c("NA", "0", "1", "1",
             "NA", "1", "1", "1",
             "NA", "0", "1", "NA")

new_prod <- c("NA", "0", "0", "0",
             "NA", "1", "1", "0",
             "NA", "1", "1", "NA")

df_mod <- data.frame(id, year, dest_prod, new_dest, new_prod)

I have tried to use ifelse in dplyr, but it does not work as it is impossible to capture the sense of time and group (id). Any idea on how to create new_dest and new_prod in dplyr?

CodePudding user response:

The first step is to parse the dest_prod column into a usable format:

library(tidyverse)

parsed <- df %>% 
  mutate_at(c("id", "year"), as.integer) %>% 
  separate_rows(dest_prod, sep = "[ ]") %>% 
  separate(dest_prod, into = c("dest", "prod"))

parsed
#> # A tibble: 18 × 4
#>       id  year dest  prod 
#>    <int> <int> <chr> <chr>
#>  1     1  2000 FRA   coke 
#>  2     1  2001 FRA   coke 
#>  3     1  2002 USA   coke 
#>  4     1  2003 FRA   coke 
#>  5     1  2003 USA   coke 
#>  6     2  2002 FRA   coke 
#>  7     2  2003 SPA   ham  
#>  8     2  2004 FRA   coke 
#>  9     2  2004 SPA   ham  
#> 10     2  2005 GER   coke 
#> 11     3  2001 CHN   water
#> 12     3  2002 CHN   oil  
#> 13     3  2003 CHN   water
#> 14     3  2003 CHN   oil  
#> 15     3  2003 FRA   coke 
#> 16     3  2005 CHN   water
#> 17     3  2005 CHN   oil  
#> 18     3  2005 FRA   coke

Then:

parsed %>% 
  # Add previous year's data in a list column
  nest_join(
    mutate(., year = year   1L),
    name = "previous",
    by = c("id", "year")
  ) %>% 
  # Check if any dest/prod appeared in previous year
  rowwise() %>% 
  mutate(
    new_dest = !dest %in% previous$dest,
    new_prod = !prod %in% previous$prod,
  ) %>% 
  # Set to missing if previous year was not observed
  mutate(
    across(c(new_dest, new_prod), replace, !nrow(previous), NA)
  ) %>% 
  # Aggregate indicators on year level
  group_by(id, year) %>% 
  summarise(
    new_dest = any(new_dest),
    new_prod = any(new_prod),
  )
#> # A tibble: 12 × 4
#> # Groups:   id [3]
#>       id  year new_dest new_prod
#>    <int> <int> <lgl>    <lgl>   
#>  1     1  2000 NA       NA      
#>  2     1  2001 FALSE    FALSE   
#>  3     1  2002 TRUE     FALSE   
#>  4     1  2003 TRUE     FALSE   
#>  5     2  2002 NA       NA      
#>  6     2  2003 TRUE     TRUE    
#>  7     2  2004 TRUE     TRUE    
#>  8     2  2005 TRUE     FALSE   
#>  9     3  2001 NA       NA      
#> 10     3  2002 FALSE    TRUE    
#> 11     3  2003 TRUE     TRUE    
#> 12     3  2005 NA       NA

CodePudding user response:

If you do want to fight the way dplyr wants to work, here's a non-pivot solution :)

library(dplyr)
library(purrr)
library(stringr)

df %>%
  mutate(year = as.numeric(year)) %>%
  left_join(., mutate(., year = year   1), by = c('id', 'year'), suffix = c('', '_previous')) %>%
  rowwise() %>%
  mutate(dests = str_extract_all(dest_prod, '\\w{3}_'),
         prods = str_extract_all(dest_prod, '_\\w '),
         new_dest = max(map_int(dests, \(x) !str_detect(dest_prod_previous, x))),
         new_prod = max(map_int(prods, \(x) !str_detect(dest_prod_previous, x)))) %>%
  select(-c(dests, prods, dest_prod_previous)) %>%
  ungroup()

#> # A tibble: 12 × 5
#>    id     year dest_prod                  new_dest new_prod
#>    <chr> <dbl> <chr>                         <int>    <int>
#>  1 1      2000 FRA_coke                         NA       NA
#>  2 1      2001 FRA_coke                          0        0
#>  3 1      2002 USA_coke                          1        0
#>  4 1      2003 FRA_coke USA_coke                 1        0
#>  5 2      2002 FRA_coke                         NA       NA
#>  6 2      2003 SPA_ham                           1        1
#>  7 2      2004 FRA_coke SPA_ham                  1        1
#>  8 2      2005 GER_coke                          1        0
#>  9 3      2001 CHN_water                        NA       NA
#> 10 3      2002 CHN_oil                           0        1
#> 11 3      2003 CHN_water CHN_oil FRA_coke        1        1
#> 12 3      2005 CHN_water CHN_oil FRA_coke       NA       NA
  • Related