Multiple variable summary with dplyr-CodePudding

I have the following DF

head(sample_data)
  article value date
1       A 21920 2015
2       I   615 2017
3       B  1414 2018
4       D   102 2018
5       I  1096 2015
6       A  2577 2021

Full dataset

dput(sample_data)
structure(list(article = c("A", "I", "B", "D", "I", "A", "C", 
"C", "D", "H", "B", "I", "A", "G", "E", "G", "D", "A", "D", "B", 
"A", "C", "D", "F", "G", "D", "G", "C", "E", "E", "G", "G", "A", 
"A", "E", "H", "B", "E", "E", "B", "B", "A", "H", "A", "B", "G", 
"D", "C", "E", "A"), value = c(21920, 615, 1414, 102, 1096, 2577, 
840, 311, 804, 695, 3863, 279, 7324, 299, 311, 133, 759, 5386, 
5396, 11051, 14708, 856, 1749, 2212, 318, 3478, 415, 781, 227, 
248, 122, 185, 1344, 15442, 248, 433, 5068, 38, 165, 369, 805, 
18944, 264, 11716, 4274, 442, 2530, 827, 164, 18506), date = c("2015", 
"2017", "2018", "2018", "2015", "2021", "2016", "2021", "2017", 
"2021", "2019", "2015", "2019", "2016", "2015", "2019", "2018", 
"2020", "2017", "2015", "2015", "2016", "2015", "2015", "2021", 
"2015", "2019", "2016", "2016", "2015", "2019", "2020", "2019", 
"2016", "2016", "2015", "2015", "2021", "2021", "2020", "2020", 
"2015", "2016", "2017", "2019", "2016", "2015", "2016", "2019", 
"2016")), row.names = c(NA, -50L), class = "data.frame")

I'm trying to use dplyr to get something along the lines of this:

sample_data %>%
    group_by(article, date) %>% 
    summarise(weight = sum(value))
`summarise()` has grouped output by 'article'. You can override using the `.groups` argument.
# A tibble: 29 x 3
# Groups:   article [9]
   article date  weight
   <chr>   <chr>  <dbl>
 1 A       2015   55572
 2 A       2016   33948
 3 A       2017   11716
 4 A       2019    8668
 5 A       2020    5386
 6 A       2021    2577
 7 B       2015   16119
 8 B       2018    1414
 9 B       2019    8137
10 B       2020    1174
# ... with 19 more rows

However, I want to add another column with a proportion of each article's weight of the total (sum of A:I) per year. The sum of all article proportions should then amount to 1 for each year.

I tried the code below. I suspect this occurs because I use "value" that results in all values being printed, hence all occurrences. How can I summarise this so it looks like the one above with the added column?

sample_data %>%
    group_by(article, date) %>% 
    summarise(weight = sum(value), prop = value/weight)
`summarise()` has grouped output by 'article', 'date'. You can override using the `.groups` argument.
# A tibble: 50 x 4
# Groups:   article, date [29]
   article date  weight  prop
   <chr>   <chr>  <dbl> <dbl>
 1 A       2015   55572 0.394
 2 A       2015   55572 0.265
 3 A       2015   55572 0.341
 4 A       2016   33948 0.455
 5 A       2016   33948 0.545
 6 A       2017   11716 1    
 7 A       2019    8668 0.845
 8 A       2019    8668 0.155
 9 A       2020    5386 1    
10 A       2021    2577 1    
# ... with 40 more rows

CodePudding user response：

After the initial summarize, you have one entry for each article per year. You then wish to know what the contribution of each article was to each year's total, so you need to group_by again using just the year, and finally mutate to get the proportion for each article.

library(dplyr)

sample_data %>%
   group_by(article, date) %>% 
   summarise(weight = sum(value), .groups = "keep") %>%
   group_by(date) %>%
   mutate(prop = weight / sum(weight))
#> # A tibble: 29 x 4
#> # Groups:   date [7]
#>    article date  weight  prop
#>    <chr>   <chr>  <dbl> <dbl>
#>  1 A       2015   55572 0.661
#>  2 A       2016   33948 0.876
#>  3 A       2017   11716 0.632
#>  4 A       2019    8668 0.491
#>  5 A       2020    5386 0.799
#>  6 A       2021    2577 0.628
#>  7 B       2015   16119 0.192
#>  8 B       2018    1414 0.622
#>  9 B       2019    8137 0.461
#> 10 B       2020    1174 0.174
#> # ... with 19 more rows

^{Created on 2022-02-19 by the reprex package (v2.0.1)}

CodePudding user response：

An option is also to have do the group by sum within first summarise

library(dplyr)
library(tibble)
library(tidyr)
sample_data %>%
  group_by(date) %>%
  summarise(out = enframe(tapply(value, article, sum)/sum(value), 
   name = 'article', value = 'prop'), .groups = 'drop') %>% 
  unpack(out)
# A tibble: 29 × 3
   date  article    prop
   <chr> <chr>     <dbl>
 1 2015  A       0.661  
 2 2015  B       0.192  
 3 2015  D       0.0923 
 4 2015  E       0.00665
 5 2015  F       0.0263 
 6 2015  H       0.00515
 7 2015  I       0.0164 
 8 2016  A       0.876  
 9 2016  C       0.0853 
10 2016  E       0.0123 
# … with 19 more rows