I have the following DF
head(sample_data)
article value date
1 A 21920 2015
2 I 615 2017
3 B 1414 2018
4 D 102 2018
5 I 1096 2015
6 A 2577 2021
Full dataset
dput(sample_data)
structure(list(article = c("A", "I", "B", "D", "I", "A", "C",
"C", "D", "H", "B", "I", "A", "G", "E", "G", "D", "A", "D", "B",
"A", "C", "D", "F", "G", "D", "G", "C", "E", "E", "G", "G", "A",
"A", "E", "H", "B", "E", "E", "B", "B", "A", "H", "A", "B", "G",
"D", "C", "E", "A"), value = c(21920, 615, 1414, 102, 1096, 2577,
840, 311, 804, 695, 3863, 279, 7324, 299, 311, 133, 759, 5386,
5396, 11051, 14708, 856, 1749, 2212, 318, 3478, 415, 781, 227,
248, 122, 185, 1344, 15442, 248, 433, 5068, 38, 165, 369, 805,
18944, 264, 11716, 4274, 442, 2530, 827, 164, 18506), date = c("2015",
"2017", "2018", "2018", "2015", "2021", "2016", "2021", "2017",
"2021", "2019", "2015", "2019", "2016", "2015", "2019", "2018",
"2020", "2017", "2015", "2015", "2016", "2015", "2015", "2021",
"2015", "2019", "2016", "2016", "2015", "2019", "2020", "2019",
"2016", "2016", "2015", "2015", "2021", "2021", "2020", "2020",
"2015", "2016", "2017", "2019", "2016", "2015", "2016", "2019",
"2016")), row.names = c(NA, -50L), class = "data.frame")
I'm trying to use dplyr to get something along the lines of this:
sample_data %>%
group_by(article, date) %>%
summarise(weight = sum(value))
`summarise()` has grouped output by 'article'. You can override using the `.groups` argument.
# A tibble: 29 x 3
# Groups: article [9]
article date weight
<chr> <chr> <dbl>
1 A 2015 55572
2 A 2016 33948
3 A 2017 11716
4 A 2019 8668
5 A 2020 5386
6 A 2021 2577
7 B 2015 16119
8 B 2018 1414
9 B 2019 8137
10 B 2020 1174
# ... with 19 more rows
However, I want to add another column with a proportion of each article's weight of the total (sum of A:I) per year. The sum of all article proportions should then amount to 1 for each year.
I tried the code below. I suspect this occurs because I use "value" that results in all values being printed, hence all occurrences. How can I summarise this so it looks like the one above with the added column?
sample_data %>%
group_by(article, date) %>%
summarise(weight = sum(value), prop = value/weight)
`summarise()` has grouped output by 'article', 'date'. You can override using the `.groups` argument.
# A tibble: 50 x 4
# Groups: article, date [29]
article date weight prop
<chr> <chr> <dbl> <dbl>
1 A 2015 55572 0.394
2 A 2015 55572 0.265
3 A 2015 55572 0.341
4 A 2016 33948 0.455
5 A 2016 33948 0.545
6 A 2017 11716 1
7 A 2019 8668 0.845
8 A 2019 8668 0.155
9 A 2020 5386 1
10 A 2021 2577 1
# ... with 40 more rows
CodePudding user response:
After the initial summarize
, you have one entry for each article per year. You then wish to know what the contribution of each article was to each year's total, so you need to group_by
again using just the year, and finally mutate
to get the proportion for each article.
library(dplyr)
sample_data %>%
group_by(article, date) %>%
summarise(weight = sum(value), .groups = "keep") %>%
group_by(date) %>%
mutate(prop = weight / sum(weight))
#> # A tibble: 29 x 4
#> # Groups: date [7]
#> article date weight prop
#> <chr> <chr> <dbl> <dbl>
#> 1 A 2015 55572 0.661
#> 2 A 2016 33948 0.876
#> 3 A 2017 11716 0.632
#> 4 A 2019 8668 0.491
#> 5 A 2020 5386 0.799
#> 6 A 2021 2577 0.628
#> 7 B 2015 16119 0.192
#> 8 B 2018 1414 0.622
#> 9 B 2019 8137 0.461
#> 10 B 2020 1174 0.174
#> # ... with 19 more rows
Created on 2022-02-19 by the reprex package (v2.0.1)
CodePudding user response:
An option is also to have do the group by sum within first summarise
library(dplyr)
library(tibble)
library(tidyr)
sample_data %>%
group_by(date) %>%
summarise(out = enframe(tapply(value, article, sum)/sum(value),
name = 'article', value = 'prop'), .groups = 'drop') %>%
unpack(out)
# A tibble: 29 × 3
date article prop
<chr> <chr> <dbl>
1 2015 A 0.661
2 2015 B 0.192
3 2015 D 0.0923
4 2015 E 0.00665
5 2015 F 0.0263
6 2015 H 0.00515
7 2015 I 0.0164
8 2016 A 0.876
9 2016 C 0.0853
10 2016 E 0.0123
# … with 19 more rows