Frequency table when there are multiple columns representing one value (R)-CodePudding

I have a dataset like this:

ID    color1   color2  color3   shape1       shape2        size
55    red     blue     NA       circle       triangle      small
67    yellow  NA       NA       triangle     NA            medium
83    blue    yellow   NA       circle       NA            large
78    red     yellow   blue     square       circle        large
43    green   NA       NA       square       circle        small
29    yellow  green    NA       circle       triangle      medium

I would like to create a dataframe where I have the frequency and percentage of each variable, but I am having trouble because there are multiple columns of the same variable in some cases.


Variable      Level        Freq        Percent 
 
color         blue          3           27.27
              red           2           18.18
              yellow        4           36.36
              green         2           18.18
              total         11          100.00

shape         circle        5           50.0       
              triangle      3           30.0
              square        2           20.0
              total         10          100.0

size          small         2           33.3
              medium        2           33.3
              large         2           33.3
              total         6           100.0

I believe that I need to convert these variables to long and then use summarize/mutate to get the frequencies, but I can't seem to figure it out. Any help is greatly appreciated.

CodePudding user response：

You can use tidyverse package to transform the data into a long format and then just summarise the desired stats.

library(tidyverse)

df |> 
  # Transform all columns into a long format
  pivot_longer(cols = -ID,
               names_pattern = "([A-z] )",
               names_to = c("variable")) |>
  # Drop NA entries
  drop_na(value) |>
  # Group by variable
  group_by(variable) |>
  # Count
  count(value) |>
  # Calculate percentage as n / sum of n by variable
  mutate(perc = 100* n / sum(n))

# A tibble: 10 x 4
# Groups:   variable [3]
#   variable value        n  perc
#   <chr>    <chr>    <int> <dbl>
# 1 color    blue         3  27.3
# 2 color    green        2  18.2
# 3 color    red          2  18.2
# 4 color    yellow       4  36.4
# 5 shape    circle       5  50  
# 6 shape    square       2  20  
# 7 shape    triangle     3  30  
# 8 size     large        2  33.3
# 9 size     medium       2  33.3
#10 size     small        2  33.3

CodePudding user response：

Combining and adding to:


library(dplyr)
library(tidyr)
library(janitor)

options(digits = 3)

df %>% 
  pivot_longer(
    -ID,
    names_to = "Variable",
    values_to = "Level"
  ) %>% 
  mutate(Variable = str_extract(Variable, '[A-Za-z]*')) %>% 
  group_by(Variable, Level) %>% 
  count(Level, name = "Freq") %>% 
  na.omit() %>% 
  group_by(Variable) %>% 
  mutate(Percent = Freq/sum(Freq)*100) %>% 
  group_split() %>% 
  adorn_totals() %>% 
  bind_rows() %>% 
  mutate(Level = ifelse(Level == last(Level), last(Variable), Level)) %>% 
  mutate(Variable = ifelse(duplicated(Variable) |
                             Variable == "Total", NA, Variable))

 Variable    Level Freq Percent
    color     blue    3    27.3
     <NA>    green    2    18.2
     <NA>      red    2    18.2
     <NA>   yellow    4    36.4
     <NA>    Total   11   100.0
    shape   circle    5    50.0
     <NA>   square    2    20.0
     <NA> triangle    3    30.0
     <NA>    Total   10   100.0
     size    large    2    33.3
     <NA>   medium    2    33.3
     <NA>    small    2    33.3
     <NA>    Total    6   100.0

CodePudding user response：

Try this matrix in list in base R

uniq <- unique( sub( "[0-9]","", colnames(dat[,-1]) ) )
uniq
[1] "color" "shape" "size"

sapply( uniq, function(x){ tbl <- table( unlist( dat[,grep( x, colnames(dat) )] ) ); 
  rbind( cbind( Percent=round( tbl/sum(tbl)*100, digits=2 ), Freq=tbl ), 
         cbind( sum(tbl/sum(tbl)*100), sum(tbl) ) ) } )
$color
         Percent Freq
blue       27.27    3
green      18.18    2
red        18.18    2
yellow     36.36    4
          100.00   11

$shape
         Percent Freq
circle        50    5
square        20    2
triangle      30    3
             100   10

$size
         Percent Freq
large      33.33    2
medium     33.33    2
small      33.33    2
          100.00    6

Data

dat <- structure(list(ID = c(55L, 67L, 83L, 78L, 43L, 29L), color1 = c("red", 
"yellow", "blue", "red", "green", "yellow"), color2 = c("blue", 
NA, "yellow", "yellow", NA, "green"), color3 = c(NA, NA, NA, 
"blue", NA, NA), shape1 = c("circle", "triangle", "circle", "square", 
"square", "circle"), shape2 = c("triangle", NA, NA, "circle", 
"circle", "triangle"), size = c("small", "medium", "large", "large", 
"small", "medium")), class = "data.frame", row.names = c(NA, 
-6L))