I have created two dataframes:
change_df = sb_data_omit_1950 %>%
group_by(decade, julian_day) %>%
count()
colnames(change_df) = c("decade", "julian_day", "nb_obs_perday_perdecade")
ponderate = change_df %>%
group_by(decade) %>%
summarize(sum_nb_tot = sum(nb_obs_perday_perdecade))
They look like this:
> ponderate
# A tibble: 8 x 2
decade sum_nb_tot
<chr> <int>
1 1950-1959 74
2 1960-1969 501
3 1970-1979 1495
4 1980-1989 2586
5 1990-1999 2735
6 2000-2009 15651
7 2010-2019 22678
8 2020-2029 6974
> dput(change_df[sample(nrow(change_df), 50),])
structure(list(decade = c("2020-2029", "1990-1999", "1960-1969",
"2020-2029", "1990-1999", "2010-2019", "1970-1979", "1950-1959",
"1950-1959", "2000-2009", "1960-1969", "1980-1989", "2020-2029",
"2000-2009", "2000-2009", "2000-2009", "2020-2029", "1970-1979",
"1980-1989", "2020-2029", "2000-2009", "2020-2029", "2010-2019",
"2010-2019", "2000-2009", "1960-1969", "1960-1969", "2020-2029",
"2000-2009", "1980-1989", "1990-1999", "2020-2029", "1980-1989",
"2010-2019", "1960-1969", "1960-1969", "2010-2019", "1960-1969",
"2000-2009", "2000-2009", "1990-1999", "1960-1969", "2010-2019",
"2010-2019", "1960-1969", "2010-2019", "1990-1999", "2010-2019",
"2000-2009", "1990-1999"), julian_day = c(119, 109, 64, 136,
57, 115, 43, 93, 39, 97, 104, 61, 116, 105, 107, 102, 130, 151,
25, 61, 144, 100, 16, 46, 147, 110, 54, 41, 32, 114, 115, 77,
32, 119, 83, 100, 57, 102, 29, 16, 41, 55, 111, 94, 75, 105,
100, 29, 39, 27), nb_obs_perday_perdecade = c(7L, 21L, 2L, 3L,
23L, 72L, 2L, 2L, 4L, 246L, 6L, 13L, 13L, 276L, 138L, 318L, 21L,
1L, 6L, 58L, 1L, 173L, 85L, 91L, 2L, 3L, 3L, 15L, 80L, 15L, 5L,
58L, 9L, 53L, 11L, 6L, 155L, 7L, 100L, 47L, 15L, 1L, 145L, 474L,
5L, 276L, 26L, 81L, 72L, 18L)), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -50L), groups = structure(list(
decade = c("1950-1959", "1950-1959", "1960-1969", "1960-1969",
"1960-1969", "1960-1969", "1960-1969", "1960-1969", "1960-1969",
"1960-1969", "1960-1969", "1970-1979", "1970-1979", "1980-1989",
"1980-1989", "1980-1989", "1980-1989", "1990-1999", "1990-1999",
"1990-1999", "1990-1999", "1990-1999", "1990-1999", "2000-2009",
"2000-2009", "2000-2009", "2000-2009", "2000-2009", "2000-2009",
"2000-2009", "2000-2009", "2000-2009", "2000-2009", "2010-2019",
"2010-2019", "2010-2019", "2010-2019", "2010-2019", "2010-2019",
"2010-2019", "2010-2019", "2010-2019", "2020-2029", "2020-2029",
"2020-2029", "2020-2029", "2020-2029", "2020-2029", "2020-2029",
"2020-2029"), julian_day = c(39, 93, 54, 55, 64, 75, 83,
100, 102, 104, 110, 43, 151, 25, 32, 61, 114, 27, 41, 57,
100, 109, 115, 16, 29, 32, 39, 97, 102, 105, 107, 144, 147,
16, 29, 46, 57, 94, 105, 111, 115, 119, 41, 61, 77, 100,
116, 119, 130, 136), .rows = structure(list(9L, 8L, 27L,
42L, 3L, 45L, 35L, 36L, 38L, 11L, 26L, 7L, 18L, 19L,
33L, 12L, 30L, 50L, 41L, 5L, 47L, 2L, 31L, 40L, 39L,
29L, 49L, 10L, 16L, 14L, 15L, 21L, 25L, 23L, 48L, 24L,
37L, 44L, 46L, 43L, 6L, 34L, 28L, 20L, 32L, 22L, 13L,
1L, 17L, 4L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -50L), .drop = TRUE))
My question is: how do I divide the column "nb_obs_perday_perdecade" from change_df by the values contained in "sum_nb_tot" column from "ponderate", matching the decades between the two dataframes?
Said in a different way, I would like that the value of the divider (sum_nb_tot from ponderate) is chosen according to the decade that is associated in both change_df and ponderate.
Many thanks.
CodePudding user response:
We can do a join and then divide
library(dplyr)
change_df <- change_df %>%
ungroup %>%
left_join(.,
ponderate) %>%
mutate(newval = nb_obs_perday_perdecade/sum_nb_tot)
-output
# A tibble: 50 × 5
decade julian_day nb_obs_perday_perdecade sum_nb_tot newval
<chr> <dbl> <int> <int> <dbl>
1 2020-2029 119 7 6974 0.00100
2 1990-1999 109 21 2735 0.00768
3 1960-1969 64 2 501 0.00399
4 2020-2029 136 3 6974 0.000430
5 1990-1999 57 23 2735 0.00841
6 2010-2019 115 72 22678 0.00317
7 1970-1979 43 2 1495 0.00134
8 1950-1959 93 2 74 0.0270
9 1950-1959 39 4 74 0.0541
10 2000-2009 97 246 15651 0.0157
# … with 40 more rows