Home > other >  Group several demographic variables with dplyr
Group several demographic variables with dplyr

Time:08-12

I have the following data structure:

country age sex 
      x  10   m
      y  20   f
      x  12   m
      y  40   m

I want to group my data according to the country and get a calculation of the percentage of my sex variable according to it, resulting in some table like this:

country mean_age percent_m percent_f
      x       11      100%        0%
      y       20       50%       50%

Thank you in advance!

CodePudding user response:

A possible solution:

library(tidyverse)

df %>% 
  mutate(age = as.character(age)) %>% 
  pivot_longer(-country) %>% 
  pivot_wider(country, values_fn = ~ {if (any(str_detect(.x, "\\d")))  
      mean(as.numeric(.x)) else proportions(table(.x))["m"]}) %>% 
  mutate(mean_age = age, percent_m = sex, percent_f = 1- percent_m, age = NULL, 
    sex = NULL)

#> # A tibble: 2 × 4
#>   country mean_age percent_m percent_f
#>   <chr>      <dbl>     <dbl>     <dbl>
#> 1 x             11       1         0  
#> 2 y             30       0.5       0.5

Another possible solution:

library(tidyverse)

inner_join(
  df %>% 
    mutate(name = "mean_age") %>% 
    pivot_wider(country, values_from = age, values_fn = mean),
  df %>% 
    mutate(name = "percent_m") %>% 
    pivot_wider(country, values_from = sex, 
      values_fn = ~ proportions(table(.x))["m"])) %>% 
  mutate(percent_f = 1 - percent_m)

#> Joining, by = "country"
#> # A tibble: 2 × 4
#>   country mean_age percent_m percent_f
#>   <chr>      <dbl>     <dbl>     <dbl>
#> 1 x             11       1         0  
#> 2 y             30       0.5       0.5

CodePudding user response:

library(tidyverse)

df <- data.frame(country = c("x", "x", "x", "x", "x", "y", "y", "y"),
                 age = c(10, 20, 12, 40, 23, 17, 21, 19),
                 sex = c("m", "f", "f", "m", "f", "m", "f", "f"))

df1 <- df %>% 
  pivot_wider(names_from = sex, values_from = sex) %>% 
  group_by(country) %>% 
  summarise(age_mean = mean(age),
            m = length(na.omit(m)),
            f = length(na.omit(f))) %>% 
  mutate(m_perc = (m / (m   f)) * 100,
         f_perc = (f / (m   f)) * 100)

> df1
# A tibble: 2 x 6
  country age_mean     m     f m_perc f_perc
  <chr>      <dbl> <int> <int>  <dbl>  <dbl>
1 x             21     2     3   40     60  
2 y             19     1     2   33.3   66.7
  • Related