Home > database >  Getting data from another column from a row where a different variable matches
Getting data from another column from a row where a different variable matches

Time:05-07

Minimal working example:

library(dplyr)
df = data.frame(group_id = c("G1","G1", rep("G2",8)),
                prod_id = c(1,2,5,6,7,8,9,10,11,12),
                prod_type = rep(c("a","a", "b", "c","d"),2),
                start = lubridate::dmy(c("01/01/2001", "02/02/2002",
                                         "05/05/2005","06/06/2006", "07/07/2007", "08/08/2008", "09/09/2009","01/01/2010", "02/02/2011", "03/03/2012" )))%>%
  group_by(group_id) %>%
  mutate(next_acd = if_else(lead(prod_type) %in% c("a","c","d"), lead(prod_id), prod_id[NA])) %>%
  tidyr::fill(next_acd, .direction = "up")

df:

group_id prod_id prod_type start      next_acd
   <chr>      <dbl> <chr>     <date>        <dbl>
 1 G1             1 a         2001-01-01        2
 2 G1             2 a         2002-02-02       NA
 3 G2             5 b         2005-05-05        6
 4 G2             6 c         2006-06-06        7
 5 G2             7 d         2007-07-07        8
 6 G2             8 a         2008-08-08        9
 7 G2             9 a         2009-09-09       11
 8 G2            10 b         2010-01-01       11
 9 G2            11 c         2011-02-02       12
10 G2            12 d         2012-03-03       NA

Where next_acd is the next prod_id in that group which is prod_type "a", "c" or "d".

I now want to create a new column fin which is the start date of the next prod_type "a", "c" or "d". I.e. I want to match the next_acd to prod_id and get the corresponding start.

I have tried the answer given on another question Get data from variable z where variable x = variable y

df$fin <- df$start[df$prod_id[df$next_acd]]

df:

 group_id prod_id prod_type start      next_acd fin       
   <chr>      <dbl> <chr>     <date>        <dbl> <date>    
 1 G1             1 a         2001-01-01        2 2002-02-02
 2 G1             2 a         2002-02-02       NA NA        
 3 G2             5 b         2005-05-05        6 2010-01-01
 4 G2             6 c         2006-06-06        7 2011-02-02
 5 G2             7 d         2007-07-07        8 2012-03-03
 6 G2             8 a         2008-08-08        9 NA        
 7 G2             9 a         2009-09-09       11 NA        
 8 G2            10 b         2010-01-01       11 NA        
 9 G2            11 c         2011-02-02       12 NA        
10 G2            12 d         2012-03-03       NA NA 

I'm not sure what is going on here. fin is only correctly entered for row 1.

This is what the output should be:

group_id prod_id prod_type start      fin        next_acd
   <chr>      <dbl> <chr>     <date>     <date>        <dbl>
 1 G1             1 a         2001-01-01 2002-02-02        2
 2 G1             2 a         2002-02-02 NA               NA
 3 G2             5 b         2005-05-05 2006-06-06        6
 4 G2             6 c         2006-06-06 2007-07-07        7
 5 G2             7 d         2007-07-07 2008-08-08        8
 6 G2             8 a         2008-08-08 2009-09-09        9
 7 G2             9 a         2009-09-09 2011-02-02       11
 8 G2            10 b         2010-01-01 2011-02-02       11
 9 G2            11 c         2011-02-02 2012-03-03       12
10 G2            12 d         2012-03-03 NA               NA

CodePudding user response:

A tidyverse option:

library(tidyverse)

df <- data.frame(
  group_id = c("G1", "G1", rep("G2", 8)),
  prod_id = c(1, 2, 5, 6, 7, 8, 9, 10, 11, 12),
  prod_type = rep(c("a", "a", "b", "c", "d"), 2),
  start = lubridate::dmy(c(
    "01/01/2001", "02/02/2002",
    "05/05/2005", "06/06/2006", "07/07/2007", "08/08/2008", "09/09/2009", "01/01/2010", "02/02/2011", "03/03/2012"
  ))
) %>%
  group_by(group_id) %>%
  mutate(next_acd = if_else(lead(prod_type) %in% c("a", "c", "d"), lead(prod_id), prod_id[NA])) %>%
  fill(next_acd, .direction = "up")

df %>%
  group_by(group_id) %>%
  mutate(fin = start[match(next_acd, prod_id, nomatch = NA_real_)])
#> # A tibble: 10 × 6
#> # Groups:   group_id [2]
#>    group_id prod_id prod_type start      next_acd fin       
#>    <chr>      <dbl> <chr>     <date>        <dbl> <date>    
#>  1 G1             1 a         2001-01-01        2 2002-02-02
#>  2 G1             2 a         2002-02-02       NA NA        
#>  3 G2             5 b         2005-05-05        6 2006-06-06
#>  4 G2             6 c         2006-06-06        7 2007-07-07
#>  5 G2             7 d         2007-07-07        8 2008-08-08
#>  6 G2             8 a         2008-08-08        9 2009-09-09
#>  7 G2             9 a         2009-09-09       11 2011-02-02
#>  8 G2            10 b         2010-01-01       11 2011-02-02
#>  9 G2            11 c         2011-02-02       12 2012-03-03
#> 10 G2            12 d         2012-03-03       NA NA

Created on 2022-05-06 by the reprex package (v2.0.1)

CodePudding user response:

Based on the suggestion to use a merge:

df1= df[,c("prod_id","start")]%>%rename(fin = start)
df2= df[,c("group_id", "prod_type","prod_id", "start","next_acd")]

left_join(df2, df1, by= c("next_acd"="prod_id"))

group_id prod_type prod_id start      next_acd fin       
   <chr>    <chr>       <dbl> <date>        <dbl> <date>    
 1 G1       a               1 2001-01-01        2 2002-02-02
 2 G1       a               2 2002-02-02       NA NA        
 3 G2       b               5 2005-05-05        6 2006-06-06
 4 G2       c               6 2006-06-06        7 2007-07-07
 5 G2       d               7 2007-07-07        8 2008-08-08
 6 G2       a               8 2008-08-08        9 2009-09-09
 7 G2       a               9 2009-09-09       11 2011-02-02
 8 G2       b              10 2010-01-01       11 2011-02-02
 9 G2       c              11 2011-02-02       12 2012-03-03
10 G2       d              12 2012-03-03       NA NA     
  • Related