sum and count value based on two criteria-CodePudding

I'm having trouble solving the following problem.

Geslacht               persondays age_cat contactfirst         
 1 V                          365 <40     2020            
 2 V                          365 <40     2019            
 3 V                          365 70-80   2019            
 4 V                          365 50-60   2019            
 5 V                          365 60-70   2020            
 6 M                          365 50-60   2020            
 7 V                          365 60-70   2019            
 8 M                           39 60-70   2019            
 9 V                          365 60-70   2019            
10 M                          365 70-80   2020            

df <- structure(list(Geslacht = c("V", "V", "V", "V", "V", "M", "V", 
"M", "V", "M", "M", "M", "V", "M", "M", "V", "V", "M", "V", "M", 
"V", "V", "M", "V", "M", "M", "M", "M", "M", "V", "M", "V", "M", 
"V", "M", "M", "V", "M", "M", "M", "M", "V", "M", "V", "M", "M", 
"M", "M", "M", "V"), persondays_individual = c(365, 365, 365, 
365, 365, 365, 365, 39, 365, 365, 365, 365, 365, 365, 365, 365, 
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 
365, 365, 365, 365, 365, 365, 365, 365), age_cat = structure(c(1L, 
1L, 6L, 4L, 5L, 4L, 5L, 5L, 5L, 6L, 1L, 5L, 4L, 6L, 5L, 7L, 6L, 
3L, 5L, 5L, 5L, 7L, 5L, 6L, 4L, 4L, 4L, 1L, 6L, 6L, 4L, 7L, 7L, 
4L, 3L, 4L, 5L, 5L, 1L, 4L, 6L, 6L, 5L, 5L, 4L, 3L, 7L, 5L, 5L, 
4L), .Label = c("<40", ">90", "40-50", "50-60", "60-70", "70-80", 
"80-90"), class = "factor"), contactfirst_cat = structure(c(11L, 
10L, 10L, 10L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L, 11L, 10L, 
11L, 10L, 10L, 10L, 10L, 10L, 11L, 10L, 11L, 11L, 11L, 10L, 11L, 
10L, 11L, 11L, 10L, 11L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L, 
10L, 11L, 11L, 11L, 10L, 10L, 10L, 10L, 11L, 11L), .Label = c("<2011", 
"2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", 
"2019", "2020"), class = "factor")), row.names = c(NA, -50L), class = c("tbl_df", 
"tbl", "data.frame"))

I would like to create two new variables. One that sums the persons days per age_cat and Gender. And the second that counts the amount of the 'contactfirst' in a certain year (in this case 2019)

Desired output:

Gender       age_cat    persondays_total    contactfirst_total
  V            <40           730            1
  V            50-60         365            1
  V            60-70         1095           2
  V            70-80         365            1
  M            50-60         365            0
  M            60-70         39             1
  M            70-80         365            0

I've tried to do it with group_by (following code). But this counts all the persondays in the dataframe (so not per gender & age_category) and does not create the column newcontacts.

df2 <- df %>% group_by(Gender, age_cat) %>%
         mutate(personyears_total = sum(persondays_individual)) %>%
         mutate(newcontacts = nrow(df$contactfirst == "2019"

Any help would be much appreciated.

CodePudding user response：

You should use summarize instead of mutate, since you are creating some summaries for each group. Also, use sum(contactfirst_cat == "2019") to count records meeting certain criteria instead of nrow(), which counts the number of rows in a dataframe.

library(dplyr)

df %>% group_by(Geslacht, age_cat) %>% 
  summarize(persondays_total = sum(persondays_individual),
            contactfirst_total = sum(contactfirst_cat == "2019"),
            .groups = "drop")

# A tibble: 11 × 4
   Geslacht age_cat persondays_total contactfirst_total
   <chr>    <fct>              <dbl>              <int>
 1 M        <40                 1095                  1
 2 M        40-50               1095                  3
 3 M        50-60               2920                  5
 4 M        60-70               2959                  4
 5 M        70-80               1460                  2
 6 M        80-90                730                  1
 7 V        <40                  730                  1
 8 V        50-60               1460                  1
 9 V        60-70               2555                  4
10 V        70-80               1825                  2
11 V        80-90               1095                  2

Data

df <- structure(list(Geslacht = c("V", "V", "V", "V", "V", "M", "V", 
"M", "V", "M", "M", "M", "V", "M", "M", "V", "V", "M", "V", "M", 
"V", "V", "M", "V", "M", "M", "M", "M", "M", "V", "M", "V", "M", 
"V", "M", "M", "V", "M", "M", "M", "M", "V", "M", "V", "M", "M", 
"M", "M", "M", "V"), persondays_individual = c(365, 365, 365, 
365, 365, 365, 365, 39, 365, 365, 365, 365, 365, 365, 365, 365, 
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 
365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 365, 
365, 365, 365, 365, 365, 365, 365, 365), age_cat = structure(c(1L, 
1L, 6L, 4L, 5L, 4L, 5L, 5L, 5L, 6L, 1L, 5L, 4L, 6L, 5L, 7L, 6L, 
3L, 5L, 5L, 5L, 7L, 5L, 6L, 4L, 4L, 4L, 1L, 6L, 6L, 4L, 7L, 7L, 
4L, 3L, 4L, 5L, 5L, 1L, 4L, 6L, 6L, 5L, 5L, 4L, 3L, 7L, 5L, 5L, 
4L), .Label = c("<40", ">90", "40-50", "50-60", "60-70", "70-80", 
"80-90"), class = "factor"), contactfirst_cat = structure(c(11L, 
10L, 10L, 10L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L, 11L, 10L, 
11L, 10L, 10L, 10L, 10L, 10L, 11L, 10L, 11L, 11L, 11L, 10L, 11L, 
10L, 11L, 11L, 10L, 11L, 11L, 11L, 10L, 10L, 10L, 11L, 11L, 10L, 
10L, 11L, 11L, 11L, 10L, 10L, 10L, 10L, 11L, 11L), .Label = c("<2011", 
"2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", 
"2019", "2020"), class = "factor")), row.names = c(NA, -50L), class = c("tbl_df", 
"tbl", "data.frame"))

CodePudding user response：

library(data.table)
setDT(df)
df[, .(persondays_total = sum(persondays_individual),
       contactfirst_total = sum(contactfirst_cat == 2019)),
     keyby = .(Geslacht, age_cat)]
#    Geslacht age_cat persondays_total contactfirst_total
# 1:        M     <40             1095                  1
# 2:        M   40-50             1095                  3
# 3:        M   50-60             2920                  5
# 4:        M   60-70             2959                  4
# 5:        M   70-80             1460                  2
# 6:        M   80-90              730                  1
# 7:        V     <40              730                  1
# 8:        V   50-60             1460                  1
# 9:        V   60-70             2555                  4
#10:        V   70-80             1825                  2
#11:        V   80-90             1095                  2

CodePudding user response：

If you don't want to bother with packages, try aggregate.

aggregate(cbind(persondays_individual, contactfirst_cat) ~ age_cat   Geslacht, 
          transform(df, contactfirst_cat=contactfirst_cat == 2019), sum)
#    age_cat Geslacht persondays_individual contactfirst_cat
# 1      <40        M                  1095                1
# 2    40-50        M                  1095                3
# 3    50-60        M                  2920                5
# 4    60-70        M                  2959                4
# 5    70-80        M                  1460                2
# 6    80-90        M                   730                1
# 7      <40        V                   730                1
# 8    50-60        V                  1460                1
# 9    60-70        V                  2555                4
# 10   70-80        V                  1825                2
# 11   80-90        V                  1095                2