Show count frequency and proportion of total count of each group level individually in R-CodePudding

I have a df that looks like this:

head(df)
site   year   common_name
1      2019   Silverside
1      2019   Silverside
1      2019   Sailfin molly
2      2019   Silverside
1      2020   Silverside

I like the function 'aggregate', but not sure how to summarize the frequency of occurrence and proportion of the total for each individual level of a group (for each species). My ultimate goal looks like this:

site  year   total_fish  total_Silverside  %_Silverside  ...etc. (all other 'common_name's)
1     2019   3           2                 67%
2     2019   1           1                 100%
1     2020   1           1                 100%

CodePudding user response：

{dplyr} can do it.

library(tidyverse)
text <- "site,   year,   common_name
1,      2019,   Silverside
1,      2019,   Silverside
1,      2019,   Sailfin molly
2,      2019,   Silverside
1,      2020,   Silverside"
data <- read.table(textConnection(text), sep = ",", header = TRUE) %>%
  as_tibble() %>%
  mutate_all(trimws)

absolutes <- data %>%
  group_by(site, year) %>%
  summarize(total = n(), as_tibble(table(common_name))) %>%
  spread(common_name, n, fill = 0) %>%
  ungroup()
#> `summarise()` has grouped output by 'site', 'year'. You can override using the `.groups` argument.

shares <- absolutes %>%
  mutate(., across(data$common_name, ~ .x / !!.$total)) %>%
  rename_with(~ paste0("share ", .x), data$common_name)

inner_join(absolutes, shares) %>%
  rename_with(~ paste0("absolute ", .x), data$common_name)
#> Joining, by = c("site", "year", "total")
#> # A tibble: 3 × 7
#>   site  year  total `absolute Sailfin molly` `absolute Silver… `share Sailfin m…
#>   <chr> <chr> <int>                    <dbl>             <dbl>             <dbl>
#> 1 1     2019      3                        1                 2             0.333
#> 2 1     2020      1                        0                 1             0    
#> 3 2     2019      1                        0                 1             0    
#> # … with 1 more variable: share Silverside <dbl>

^{Created on 2021-12-09 by the reprex package (v2.0.1)}

CodePudding user response：

Another possible solution:

library(tidyverse)


df %>%
  pivot_wider(id_cols = c(site, year), names_from = common_name, 
    values_from = common_name, values_fn = length, names_prefix = "total_",
      values_fill = 0) %>%
  mutate(tot = rowSums(select(.,starts_with("total_")))) %>% 
  mutate(across(
    starts_with("total_"), ~100* .x /tot, .names="perc_{str_remove(col,'total_')}")) %>% as.data.frame()

#>   site year total_Silverside total_Sailfin molly tot perc_Silverside
#> 1    1 2019                2                   1   3        66.66667
#> 2    2 2019                1                   0   1       100.00000
#> 3    1 2020                1                   0   1       100.00000
#>   perc_Sailfin molly
#> 1           33.33333
#> 2            0.00000
#> 3            0.00000