I am trying to produce multiple frequency tables that are stratified by multiple independent variables. I can get this to work for one variable and one stratification variable, but my for-loop is broken.
library(tidyverse)
# Create example dataframe of survey data
df <- data.frame(
var1 = sample(1:7, 1000, replace = TRUE),
var2 = sample(1:7, 1000, replace = TRUE),
var3 = sample(1:7, 1000, replace = TRUE),
var4 = sample(1:7, 1000, replace = TRUE),
var5 = sample(1:7, 1000, replace = TRUE),
var6 = sample(1:7, 1000, replace = TRUE),
strat1 = sample(c("A", "B", "C"), 1000, replace = TRUE),
strat2 = sample(c("X", "Y"), 1000, replace = TRUE),
strat3 = sample(c("True", "False"), 1000, replace = TRUE)
)
Example that works for one variable and one stratification variable. I want to convert this code into a for loop:
temp_df <- df %>% count(var1)
temp_df$percent <- temp_df$n / sum(temp_df$n) * 10
strat_df <- temp_df %>%
left_join((df %>% group_by(var1, strat1) %>% count(var1) %>% pivot_wider(names_from = strat1, values_from = n)), by = "var1")
for(k in c("A","B","C")){
strat_df[paste0(k, "_pct")] <- (strat_df[[k]] / temp_df$n) * 100
}
I want this same sort of output, but with added columns for count and _pct of the other two stratification variables.
I've tried using the following for loop, but it's only giving me one row per variable and it only produces two columns for each strat variable, whereas the output I'm looking for would have a raw count and column percentage column for each category within a stratification variable. Since there are 3 strat vars, two having two categories and one having three categories, my desired output would have 13 columns including the column for "v#", "n", and "percent".
# Create a list of the variables of interest
variables <- c("var1", "var2", "var3", "var4", "var5", "var6")
# Create a list of the stratification variables
strats <- c("strat1", "strat2", "strat3")
# Create a loop that runs through each variable
for(i in variables){
# Create a frequency table for the current variable
temp_df <- df %>% count(!! i)
# Add a column for the percent of responses within each response category
temp_df$percent <- temp_df$n / sum(temp_df$n) * 100
# Add a column for the raw count for each category of the stratification variables
for(j in strats){
temp_df <- temp_df %>% group_by(!!i) %>% mutate( !!j := n() )
}
# Add a column for the percent of the stratification variable category within the response category
for(j in strats){
temp_df[paste0(j, "_pct")] <- (temp_df[[j]] / temp_df$n) * 100
}
assign(paste0(i,"_df"), temp_df)
}
This is what I would like my output to look like:
CodePudding user response:
UPDATE:
Came up with a solution that outputs what I need:
for(i in variables){
j = sym(i)
temp_df <- df %>% count(!!j)
temp_df$percent <- temp_df$n / sum(temp_df$n) * 10
strat_df <- temp_df %>%
left_join((df %>% group_by(!!j, strat1) %>% count(!!j) %>% pivot_wider(names_from = strat1, values_from = n)), by = i) %>%
left_join((df %>% group_by(!!j, strat2) %>% count(!!j) %>% pivot_wider(names_from = strat2, values_from = n)), by = i) %>%
left_join((df %>% group_by(!!j, strat3) %>% count(!!j) %>% pivot_wider(names_from = strat3, values_from = n)), by = i)
for(k in c("A","B","C","X","Y","True","False")){
strat_df[paste0(k, "_pct")] <- (strat_df[[k]] / temp_df$n) * 100
}
assign(paste0(i,"_df"), strat_df)
CodePudding user response:
Either convert to sym
bol and evaluate (!!
) or use across
as the variables looped are strings
for(i in variables){
# Create a frequency table for the current variable
temp_df <- df %>% count(across(all_of(i)))
# Add a column for the percent of responses within each response category
temp_df$percent <- temp_df$n / sum(temp_df$n) * 100
# Add a column for the raw count for each category of the stratification variables
strat_df <- temp_df %>%
left_join((df %>% group_by(across(all_of(c(i, "strat1")))) %>%
count(across(all_of(i))) %>%
pivot_wider(names_from = strat1, values_from = n)), by = i) %>%
left_join((df %>% group_by(across(all_of(c(i, "strat2")))) %>%
count(across(all_of(i))) %>%
pivot_wider(names_from = strat2, values_from = n)), by = i) %>%
left_join((df %>% group_by(across(all_of(c(i, "strat3")))) %>%
count(across(all_of(i))) %>%
pivot_wider(names_from = strat3, values_from = n)), by = i)
# Add a column for the percent of the stratification variable category within the response category
for(j in c("A","B","C","X","Y","True","False")){
strat_df[paste0(j, "_pct")] <- (strat_df[[j]] / temp_df$n) * 100
}
assign(paste0(i,"_df"), strat_df)
}
-output
> var1_df
var1 n percent A B C X Y False True A_pct B_pct C_pct X_pct Y_pct True_pct False_pct
1 1 121 12.1 36 42 43 59 62 63 58 29.75207 34.71074 35.53719 48.76033 51.23967 47.93388 52.06612
2 2 144 14.4 51 42 51 84 60 69 75 35.41667 29.16667 35.41667 58.33333 41.66667 52.08333 47.91667
3 3 147 14.7 41 39 67 60 87 73 74 27.89116 26.53061 45.57823 40.81633 59.18367 50.34014 49.65986
4 4 146 14.6 52 45 49 74 72 79 67 35.61644 30.82192 33.56164 50.68493 49.31507 45.89041 54.10959
5 5 165 16.5 51 57 57 86 79 76 89 30.90909 34.54545 34.54545 52.12121 47.87879 53.93939 46.06061
6 6 133 13.3 48 51 34 64 69 68 65 36.09023 38.34586 25.56391 48.12030 51.87970 48.87218 51.12782
7 7 144 14.4 53 44 47 67 77 73 71 36.80556 30.55556 32.63889 46.52778 53.47222 49.30556 50.69444
> var2_df
var2 n percent A B C X Y False True A_pct B_pct C_pct X_pct Y_pct True_pct False_pct
1 1 152 15.2 51 53 48 79 73 70 82 33.55263 34.86842 31.57895 51.97368 48.02632 53.94737 46.05263
2 2 147 14.7 49 46 52 73 74 55 92 33.33333 31.29252 35.37415 49.65986 50.34014 62.58503 37.41497
3 3 142 14.2 46 45 51 72 70 79 63 32.39437 31.69014 35.91549 50.70423 49.29577 44.36620 55.63380
4 4 147 14.7 50 48 49 74 73 72 75 34.01361 32.65306 33.33333 50.34014 49.65986 51.02041 48.97959
5 5 128 12.8 45 43 40 59 69 72 56 35.15625 33.59375 31.25000 46.09375 53.90625 43.75000 56.25000
6 6 152 15.2 37 52 63 74 78 83 69 24.34211 34.21053 41.44737 48.68421 51.31579 45.39474 54.60526
7 7 132 13.2 54 33 45 63 69 70 62 40.90909 25.00000 34.09091 47.72727 52.27273 46.96970 53.03030