Add columns with percentages-CodePudding

The data provided shows the count of each variable (1-10) for each district. For example, let's say the total/sum of each variable value for district 1 is x (100%). Now, I want to add the percentage of each categorical variable with respect to the sum of that district in new columns (10 new columns) (say pct1, pct2 and etc.) for each district (so, sum of percentages of pct1 - pct10 would be a 100%). I would like to do this for each district.

How can I do this?

data:

df = structure(list(LONGNAME = c("District 1", "District 2", "District 3", 
"District 4", "District 5", "District 6", "District 7", "District 8", 
"District 9"), `1` = c(42065L, 49367L, 37063L, 27360L, 36424L, 
26415L, 28010L, 35001L, 30510L), `2` = c(1667L, 3598L, 3989L, 
6762L, 2125L, 2660L, 3912L, 4258L, 4089L), `3` = c(6004L, 7672L, 
9245L, 10118L, 5554L, 5520L, 7853L, 8716L, 7554L), `4` = c(6712L, 
2656L, 4700L, 6868L, 2560L, 3452L, 4838L, 4656L, 3458L), `5` = c(5626L, 
4823L, 5086L, 10852L, 4700L, 4239L, 5032L, 7864L, 5623L), `6` = c(5737L, 
9123L, 10042L, 20704L, 10119L, 13243L, 11144L, 12049L, 14064L
), `7` = c(4669L, 4338L, 3859L, 12530L, 4453L, 8099L, 3658L, 
4049L, 6360L), `8` = c(2212L, 2980L, 1647L, 11298L, 4611L, 13664L, 
1139L, 2132L, 6879L), `9` = c(302L, 830L, 700L, 6476L, 675L, 
20224L, 357L, 588L, 907L), `10` = c(14L, 25L, 12L, 1087L, 6L, 
2916L, 2L, 23L, 5L)), row.names = c(NA, -9L), class = c("tbl_df", 
"tbl", "data.frame"))

CodePudding user response：

We may use across to loop over the integer columns and apply proportions. To create the new columns, modify the .names by appending pct before the {.col} - stands for the column name

library(dplyr)
df <- df %>% 
   mutate(across(-LONGNAME, ~ proportions(.x) * 100, .names = "pct{.col}"))

-output

df
# A tibble: 9 × 21
  LONGNAME     `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`  `10`  pct1  pct2  pct3  pct4  pct5  pct6  pct7  pct8   pct9   pct10
  <chr>      <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>   <dbl>
1 District 1 42065  1667  6004  6712  5626  5737  4669  2212   302    14 13.5   5.04  8.80 16.8  10.4   5.40  8.98  4.75  0.972  0.342 
2 District 2 49367  3598  7672  2656  4823  9123  4338  2980   830    25 15.8  10.9  11.2   6.66  8.96  8.59  8.34  6.40  2.67   0.611 
3 District 3 37063  3989  9245  4700  5086 10042  3859  1647   700    12 11.9  12.1  13.5  11.8   9.45  9.45  7.42  3.54  2.25   0.293 
4 District 4 27360  6762 10118  6868 10852 20704 12530 11298  6476  1087  8.76 20.5  14.8  17.2  20.2  19.5  24.1  24.3  20.9   26.6   
5 District 5 36424  2125  5554  2560  4700 10119  4453  4611   675     6 11.7   6.43  8.14  6.42  8.73  9.53  8.56  9.90  2.17   0.147 
6 District 6 26415  2660  5520  3452  4239 13243  8099 13664 20224  2916  8.46  8.05  8.09  8.65  7.87 12.5  15.6  29.3  65.1   71.3   
7 District 7 28010  3912  7853  4838  5032 11144  3658  1139   357     2  8.97 11.8  11.5  12.1   9.35 10.5   7.03  2.45  1.15   0.0489
8 District 8 35001  4258  8716  4656  7864 12049  4049  2132   588    23 11.2  12.9  12.8  11.7  14.6  11.3   7.78  4.58  1.89   0.562 
9 District 9 30510  4089  7554  3458  5623 14064  6360  6879   907     5  9.77 12.4  11.1   8.67 10.4  13.2  12.2  14.8   2.92   0.122

If we want by row, then use the margin 1

df[paste0("pct", names(df)[-1])] <- proportions(as.matrix(df[-1]), 1) * 100

-output

> df
# A tibble: 9 × 21
  LONGNAME     `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`  `10`  pct1  pct2  pct3  pct4  pct5  pct6  pct7  pct8   pct9   pct10
  <chr>      <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>   <dbl>
1 District 1 42065  1667  6004  6712  5626  5737  4669  2212   302    14  56.1  2.22  8.00  8.95  7.50  7.65  6.22  2.95  0.403 0.0187 
2 District 2 49367  3598  7672  2656  4823  9123  4338  2980   830    25  57.8  4.21  8.98  3.11  5.65 10.7   5.08  3.49  0.972 0.0293 
3 District 3 37063  3989  9245  4700  5086 10042  3859  1647   700    12  48.5  5.23 12.1   6.16  6.66 13.2   5.05  2.16  0.917 0.0157 
4 District 4 27360  6762 10118  6868 10852 20704 12530 11298  6476  1087  24.0  5.93  8.87  6.02  9.51 18.2  11.0   9.91  5.68  0.953  
5 District 5 36424  2125  5554  2560  4700 10119  4453  4611   675     6  51.1  2.98  7.80  3.59  6.60 14.2   6.25  6.47  0.948 0.00842
6 District 6 26415  2660  5520  3452  4239 13243  8099 13664 20224  2916  26.3  2.65  5.50  3.44  4.22 13.2   8.06 13.6  20.1   2.90   
7 District 7 28010  3912  7853  4838  5032 11144  3658  1139   357     2  42.5  5.93 11.9   7.34  7.63 16.9   5.55  1.73  0.541 0.00303
8 District 8 35001  4258  8716  4656  7864 12049  4049  2132   588    23  44.1  5.37 11.0   5.87  9.91 15.2   5.10  2.69  0.741 0.0290 
9 District 9 30510  4089  7554  3458  5623 14064  6360  6879   907     5  38.4  5.15  9.51  4.35  7.08 17.7   8.01  8.66  1.14  0.00629
> rowSums(df[-(1:11)])
[1] 100 100 100 100 100 100 100 100 100