Home > Software design >  Divide a column from a dataframe by the values from a column of another dataframe, based on conditio
Divide a column from a dataframe by the values from a column of another dataframe, based on conditio

Time:05-28

I have created two dataframes:

change_df = sb_data_omit_1950 %>% 
  group_by(decade, julian_day) %>% 
  count() 

colnames(change_df) = c("decade", "julian_day", "nb_obs_perday_perdecade")

ponderate = change_df %>% 
  group_by(decade) %>% 
  summarize(sum_nb_tot = sum(nb_obs_perday_perdecade))

They look like this:

> ponderate
# A tibble: 8 x 2
  decade    sum_nb_tot
  <chr>          <int>
1 1950-1959         74
2 1960-1969        501
3 1970-1979       1495
4 1980-1989       2586
5 1990-1999       2735
6 2000-2009      15651
7 2010-2019      22678
8 2020-2029       6974

> dput(change_df[sample(nrow(change_df), 50),])
structure(list(decade = c("2020-2029", "1990-1999", "1960-1969", 
"2020-2029", "1990-1999", "2010-2019", "1970-1979", "1950-1959", 
"1950-1959", "2000-2009", "1960-1969", "1980-1989", "2020-2029", 
"2000-2009", "2000-2009", "2000-2009", "2020-2029", "1970-1979", 
"1980-1989", "2020-2029", "2000-2009", "2020-2029", "2010-2019", 
"2010-2019", "2000-2009", "1960-1969", "1960-1969", "2020-2029", 
"2000-2009", "1980-1989", "1990-1999", "2020-2029", "1980-1989", 
"2010-2019", "1960-1969", "1960-1969", "2010-2019", "1960-1969", 
"2000-2009", "2000-2009", "1990-1999", "1960-1969", "2010-2019", 
"2010-2019", "1960-1969", "2010-2019", "1990-1999", "2010-2019", 
"2000-2009", "1990-1999"), julian_day = c(119, 109, 64, 136, 
57, 115, 43, 93, 39, 97, 104, 61, 116, 105, 107, 102, 130, 151, 
25, 61, 144, 100, 16, 46, 147, 110, 54, 41, 32, 114, 115, 77, 
32, 119, 83, 100, 57, 102, 29, 16, 41, 55, 111, 94, 75, 105, 
100, 29, 39, 27), nb_obs_perday_perdecade = c(7L, 21L, 2L, 3L, 
23L, 72L, 2L, 2L, 4L, 246L, 6L, 13L, 13L, 276L, 138L, 318L, 21L, 
1L, 6L, 58L, 1L, 173L, 85L, 91L, 2L, 3L, 3L, 15L, 80L, 15L, 5L, 
58L, 9L, 53L, 11L, 6L, 155L, 7L, 100L, 47L, 15L, 1L, 145L, 474L, 
5L, 276L, 26L, 81L, 72L, 18L)), class = c("grouped_df", "tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -50L), groups = structure(list(
    decade = c("1950-1959", "1950-1959", "1960-1969", "1960-1969", 
    "1960-1969", "1960-1969", "1960-1969", "1960-1969", "1960-1969", 
    "1960-1969", "1960-1969", "1970-1979", "1970-1979", "1980-1989", 
    "1980-1989", "1980-1989", "1980-1989", "1990-1999", "1990-1999", 
    "1990-1999", "1990-1999", "1990-1999", "1990-1999", "2000-2009", 
    "2000-2009", "2000-2009", "2000-2009", "2000-2009", "2000-2009", 
    "2000-2009", "2000-2009", "2000-2009", "2000-2009", "2010-2019", 
    "2010-2019", "2010-2019", "2010-2019", "2010-2019", "2010-2019", 
    "2010-2019", "2010-2019", "2010-2019", "2020-2029", "2020-2029", 
    "2020-2029", "2020-2029", "2020-2029", "2020-2029", "2020-2029", 
    "2020-2029"), julian_day = c(39, 93, 54, 55, 64, 75, 83, 
    100, 102, 104, 110, 43, 151, 25, 32, 61, 114, 27, 41, 57, 
    100, 109, 115, 16, 29, 32, 39, 97, 102, 105, 107, 144, 147, 
    16, 29, 46, 57, 94, 105, 111, 115, 119, 41, 61, 77, 100, 
    116, 119, 130, 136), .rows = structure(list(9L, 8L, 27L, 
        42L, 3L, 45L, 35L, 36L, 38L, 11L, 26L, 7L, 18L, 19L, 
        33L, 12L, 30L, 50L, 41L, 5L, 47L, 2L, 31L, 40L, 39L, 
        29L, 49L, 10L, 16L, 14L, 15L, 21L, 25L, 23L, 48L, 24L, 
        37L, 44L, 46L, 43L, 6L, 34L, 28L, 20L, 32L, 22L, 13L, 
        1L, 17L, 4L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -50L), .drop = TRUE))

My question is: how do I divide the column "nb_obs_perday_perdecade" from change_df by the values contained in "sum_nb_tot" column from "ponderate", matching the decades between the two dataframes?

Said in a different way, I would like that the value of the divider (sum_nb_tot from ponderate) is chosen according to the decade that is associated in both change_df and ponderate.

Many thanks.

CodePudding user response:

We can do a join and then divide

library(dplyr)
change_df <- change_df %>%
   ungroup %>%
   left_join(., 
    ponderate) %>% 
   mutate(newval = nb_obs_perday_perdecade/sum_nb_tot)

-output

# A tibble: 50 × 5
   decade    julian_day nb_obs_perday_perdecade sum_nb_tot   newval
   <chr>          <dbl>                   <int>      <int>    <dbl>
 1 2020-2029        119                       7       6974 0.00100 
 2 1990-1999        109                      21       2735 0.00768 
 3 1960-1969         64                       2        501 0.00399 
 4 2020-2029        136                       3       6974 0.000430
 5 1990-1999         57                      23       2735 0.00841 
 6 2010-2019        115                      72      22678 0.00317 
 7 1970-1979         43                       2       1495 0.00134 
 8 1950-1959         93                       2         74 0.0270  
 9 1950-1959         39                       4         74 0.0541  
10 2000-2009         97                     246      15651 0.0157  
# … with 40 more rows
  • Related