Example Dataframe
structure(list(sex = c("Male", "Female", "Female", "Female",
"Male", "Female", "Female", "Male", "Female"), cigarettes_smoking_status = c("Non-smoker",
"Non-smoker", "Non-smoker", "Non-smoker", "Non-smoker", "Non-smoker",
"Non-smoker", "Regular Smoker", "Non-smoker")), row.names = 2:10, class = "data.frame")
Code
smoking_status_by_per <- smoking_dataset %>%
group_by(cigarettes_smoking_status, sex) %>%
dplyr::summarise(count1=n()) %>%
mutate(percentage=(count1/sum(count1))*100) %>%
pivot_wider(names_from = sex, values_from = percentage) %>%
group_by(cigarettes_smoking_status)
The problem
I am having difficulty producing a percentage table in R that is condensed to 4 rows (Occasional smokers, Non-smokers, regular smokers and Prefer not to say) that clearly shows the percentage in each category by sex. Ideally, I am looking to produce a table in R that looks like thisHow I want the table to look
I have been attempting to use janitor::tabyl and pivot_wider to condense the rows, so there are just 4 rows. One row for Regular smokers. One row for occasional smokers etc. This is what my current output looks like.
CodePudding user response:
smoking_status_by_per %>%
# generate counts
janitor::tabyl(cigarettes_smoking_status, sex) %>%
# add total row/column
janitor::adorn_totals(where = c('row', 'col')) %>%
# convert counts to percentages
janitor::adorn_percentages() %>%
janitor::adorn_pct_formatting()
cigarettes_smoking_status Female Male Total
Non-smoker 75.0% 25.0% 100.0%
Regular Smoker 0.0% 100.0% 100.0%
Total 66.7% 33.3% 100.0%
This does convert the totals to percentages. You can use janitor::adorn_ns
to add back counts to the percentages as well. Or save the totals after calculating the totals and add them back to the table afterwards (rbind the last row and cbind the Totals column with the counts).
CodePudding user response:
We can use proportions
and some binding to get what you have in the example.
Starting with enough data to fill out the matrix,
set.seed(42)
quux <- data.frame(response = sample(c("Non-smoker", "Occasional smoker", "Prefer not to say", "Regular smoker"), size=5000, replace=TRUE), gender = sample(c("Male", "Female", "Prefer not to say", "Unknown"), size=5000, replace=TRUE))
head(quux)
# response gender
# 1 Non-smoker Unknown
# 2 Non-smoker Prefer not to say
# 3 Non-smoker Female
# 4 Non-smoker Unknown
# 5 Occasional smoker Female
# 6 Regular smoker Unknown
base R
We can look at a simple table with:
table(quux)
# gender
# response Female Male Prefer not to say Unknown
# Non-smoker 330 294 323 312
# Occasional smoker 308 344 287 325
# Prefer not to say 292 337 310 304
# Regular smoker 309 308 311 306
For future verification, the sum of the first column (Female
) is 1239, and the expected column-wise percentages for that are
c(330, 308, 292, 309) / 1239
# [1] 0.2663438 0.2485876 0.2356739 0.2493947
We can get the percentages with
round(100 * proportions(table(quux), margin = 2), 2)
# gender
# response Female Male Prefer not to say Unknown
# Non-smoker 26.63 22.92 26.24 25.02
# Occasional smoker 24.86 26.81 23.31 26.06
# Prefer not to say 23.57 26.27 25.18 24.38
# Regular smoker 24.94 24.01 25.26 24.54
Do get the right-most (Total) and bottom summary, we'll need to bind things.
tbl1 <- do.call(table, quux)
tbl2 <- 100 * proportions(tbl1, margin = 2)
tbl3 <- rbind(tbl2, `Number of Respondents` = colSums(tbl1))
tbl3
# Female Male Prefer not to say Unknown
# Non-smoker 26.63438 22.91504 26.23883 25.02005
# Occasional smoker 24.85876 26.81216 23.31438 26.06255
# Prefer not to say 23.56739 26.26656 25.18278 24.37851
# Regular smoker 24.93947 24.00624 25.26401 24.53889
# Number of Respondents 1239.00000 1283.00000 1231.00000 1247.00000
tbl4 <- cbind(tbl3, `Total %` = c(100 * proportions(rowSums(tbl1)), sum(tbl1)))
tbl4
# Female Male Prefer not to say Unknown Total %
# Non-smoker 26.63438 22.91504 26.23883 25.02005 25.18
# Occasional smoker 24.85876 26.81216 23.31438 26.06255 25.28
# Prefer not to say 23.56739 26.26656 25.18278 24.37851 24.86
# Regular smoker 24.93947 24.00624 25.26401 24.53889 24.68
# Number of Respondents 1239.00000 1283.00000 1231.00000 1247.00000 5000.00
And we can round the numbers:
round(tbl4, 1)
# Female Male Prefer not to say Unknown Total %
# Non-smoker 26.6 22.9 26.2 25.0 25.2
# Occasional smoker 24.9 26.8 23.3 26.1 25.3
# Prefer not to say 23.6 26.3 25.2 24.4 24.9
# Regular smoker 24.9 24.0 25.3 24.5 24.7
# Number of Respondents 1239.0 1283.0 1231.0 1247.0 5000.0
dplyr
library(dplyr)
library(tidyr) # pivot_wider
tbl1 <- tibble(quux) %>%
count(response, gender) %>%
pivot_wider(response, names_from = gender, values_from = n)
tbl1
# # A tibble: 4 × 5
# response Female Male `Prefer not to say` Unknown
# <chr> <int> <int> <int> <int>
# 1 Non-smoker 330 294 323 312
# 2 Occasional smoker 308 344 287 325
# 3 Prefer not to say 292 337 310 304
# 4 Regular smoker 309 308 311 306
tbl2 <- tbl1 %>%
summarize(
response = "Number of Respondents",
across(-response, ~ sum(.)),
`Total %` = sum(tbl1[,-1])
)
tbl2
# # A tibble: 1 × 6
# response Female Male `Prefer not to say` Unknown `Total %`
# <chr> <int> <int> <int> <int> <int>
# 1 Number of Respondents 1239 1283 1231 1247 5000
tbl1 %>%
mutate(
across(Female:Unknown, ~ 100 * . / sum(.)),
`Total %` = rowSums(tbl1[,-1]),
`Total %` = 100 * `Total %` / sum(`Total %`)
) %>%
bind_rows(tbl2)
# # A tibble: 5 × 6
# response Female Male `Prefer not to say` Unknown `Total %`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Non-smoker 26.6 22.9 26.2 25.0 25.2
# 2 Occasional smoker 24.9 26.8 23.3 26.1 25.3
# 3 Prefer not to say 23.6 26.3 25.2 24.4 24.9
# 4 Regular smoker 24.9 24.0 25.3 24.5 24.7
# 5 Number of Respondents 1239 1283 1231 1247 5000