Calculating growth rates across rows by groups in the long format (relative to a base group) in R-CodePudding

I would like to calculate growth rates across rows by groups (here groups are "country" and "year") in a dataset that is in the long format. Since the growth rate should be relative to the same value (i.e., same "country" and same "year") under the "Baseline" scenario (not relative to the previous row), I am doing this by changing the data format to wide format, as follows:

df <- spread(df, scenario, value) %>% 
    mutate(NDC_growth=((NDC/Baseline)-1)*100,
       `Partial BCA_growth`=((`Partial BCA`/Baseline)-1)*100,
       BCA_growth=((BCA/Baseline)-1)*100,
       `Full BCA_growth`=((`Full BCA`/Baseline)-1)*100 )

Is there a way to do it in the long format?

Here is the data:

    df<- structure(list(country = c("CAN", "CAN", "CAN", "CAN", "CAN", 
"CAN", "CAN", "CAN", "CAN", "CAN", "CAN", "CAN", "CAN", "CAN", 
"CAN", "NCOA", "NCOA", "NCOA", "NCOA", "NCOA"), year = c("2020", 
"2020", "2020", "2020", "2020", "2025", "2025", "2025", "2025", 
"2025", "2030", "2030", "2030", "2030", "2030", "2020", "2020", 
"2020", "2020", "2020"), scenario = c("Baseline", "BCA", "Full BCA", 
"NDC", "Partial BCA", "Baseline", "BCA", "Full BCA", "NDC", "Partial BCA", 
"Baseline", "BCA", "Full BCA", "NDC", "Partial BCA", "Baseline", 
"BCA", "Full BCA", "NDC", "Partial BCA"), value = c(50527.8708215592, 
50487.4619290311, 50485.0924261504, 50489.4453487844, 50486.1975947164, 
55845.9708589775, 55070.2745559464, 55133.107605613, 55153.4525662034, 
55065.0036253937, 61463.2383809614, 59893.8712077455, 59971.8726308887, 
59936.72156767, 59875.7762254252, 338418.917408225, 338420.617142445, 
338428.007621131, 338419.514027857, 338427.263672463)), row.names = c(NA, 
-20L), class = c("tbl_df", "tbl", "data.frame"))

CodePudding user response：

To compute based on the first in each group use, well, first. This is valid if Baseline is always the first in each group.

df<- structure(list(
  country = c("CAN", "CAN", "CAN", "CAN", "CAN", 
              "CAN", "CAN", "CAN", "CAN", "CAN", "CAN", "CAN", "CAN", "CAN", 
              "CAN", "NCOA", "NCOA", "NCOA", "NCOA", "NCOA"), 
  year = c("2020", "2020", "2020", "2020", "2020", "2025", "2025", 
           "2025", "2025", "2025", "2030", "2030", "2030", "2030", "2030", 
           "2020", "2020", "2020", "2020", "2020"), 
  scenario = c("Baseline", "BCA", "Full BCA", "NDC", "Partial BCA", 
               "Baseline", "BCA", "Full BCA", "NDC", "Partial BCA", 
               "Baseline", "BCA", "Full BCA", "NDC", "Partial BCA", "Baseline", 
               "BCA", "Full BCA", "NDC", "Partial BCA"), 
  value = c(50527.8708215592, 50487.4619290311, 50485.0924261504, 
            50489.4453487844, 50486.1975947164, 55845.9708589775, 55070.2745559464, 
            55133.107605613, 55153.4525662034, 55065.0036253937, 61463.2383809614, 
            59893.8712077455, 59971.8726308887, 59936.72156767, 59875.7762254252, 
            338418.917408225, 338420.617142445, 338428.007621131, 338419.514027857, 
            338427.263672463)), row.names = c(NA, -20L), 
  class = c("tbl_df", "tbl", "data.frame"))

suppressPackageStartupMessages(library(dplyr))

df %>%
  group_by(country, year) %>%
  mutate(growth = (value/first(value) - 1)*100)
#> # A tibble: 20 × 5
#> # Groups:   country, year [4]
#>    country year  scenario      value    growth
#>    <chr>   <chr> <chr>         <dbl>     <dbl>
#>  1 CAN     2020  Baseline     50528.  0       
#>  2 CAN     2020  BCA          50487. -0.0800  
#>  3 CAN     2020  Full BCA     50485. -0.0847  
#>  4 CAN     2020  NDC          50489. -0.0760  
#>  5 CAN     2020  Partial BCA  50486. -0.0825  
#>  6 CAN     2025  Baseline     55846.  0       
#>  7 CAN     2025  BCA          55070. -1.39    
#>  8 CAN     2025  Full BCA     55133. -1.28    
#>  9 CAN     2025  NDC          55153. -1.24    
#> 10 CAN     2025  Partial BCA  55065. -1.40    
#> 11 CAN     2030  Baseline     61463.  0       
#> 12 CAN     2030  BCA          59894. -2.55    
#> 13 CAN     2030  Full BCA     59972. -2.43    
#> 14 CAN     2030  NDC          59937. -2.48    
#> 15 CAN     2030  Partial BCA  59876. -2.58    
#> 16 NCOA    2020  Baseline    338419.  0       
#> 17 NCOA    2020  BCA         338421.  0.000502
#> 18 NCOA    2020  Full BCA    338428.  0.00269 
#> 19 NCOA    2020  NDC         338420.  0.000176
#> 20 NCOA    2020  Partial BCA 338427.  0.00247

^{Created on 2022-05-08 by the reprex package (v2.0.1)}

CodePudding user response：

You could do:

df %>% 
  group_by(country, year) %>% 
  mutate(growth = (value / value[scenario == "Baseline"] - 1) * 100)
#> # A tibble: 20 x 5
#> # Groups:   country, year [4]
#>    country year  scenario      value    growth
#>    <chr>   <chr> <chr>         <dbl>     <dbl>
#>  1 CAN     2020  Baseline     50528.  0       
#>  2 CAN     2020  BCA          50487. -0.0800  
#>  3 CAN     2020  Full BCA     50485. -0.0847  
#>  4 CAN     2020  NDC          50489. -0.0760  
#>  5 CAN     2020  Partial BCA  50486. -0.0825  
#>  6 CAN     2025  Baseline     55846.  0       
#>  7 CAN     2025  BCA          55070. -1.39    
#>  8 CAN     2025  Full BCA     55133. -1.28    
#>  9 CAN     2025  NDC          55153. -1.24    
#> 10 CAN     2025  Partial BCA  55065. -1.40    
#> 11 CAN     2030  Baseline     61463.  0       
#> 12 CAN     2030  BCA          59894. -2.55    
#> 13 CAN     2030  Full BCA     59972. -2.43    
#> 14 CAN     2030  NDC          59937. -2.48    
#> 15 CAN     2030  Partial BCA  59876. -2.58    
#> 16 NCOA    2020  Baseline    338419.  0       
#> 17 NCOA    2020  BCA         338421.  0.000502
#> 18 NCOA    2020  Full BCA    338428.  0.00269 
#> 19 NCOA    2020  NDC         338420.  0.000176
#> 20 NCOA    2020  Partial BCA 338427.  0.00247