Home > Mobile >  Convert to long and make frequency table when column names are strings, R
Convert to long and make frequency table when column names are strings, R

Time:12-18

ID    What color is this item?   What color is this item?_2    What is the shape of this item? What is the shape of this item?_2          size
55    red                         blue                          circle                           triangle                                 small                                             
83    blue                        yellow                        circle                           NA                                       large
78    red                         yellow                        square                           circle                                   large
43    green                       NA                            square                           circle                                   small
29    yellow                      green                         circle                           triangle                                 medium             

I would like to get a frequency table like this:

Variable      Level        Freq        Percent 
 
color         blue          2           22.22
              red           2           22.22
              yellow        3           33.33
              green         2           22.22
              total         9           100.00

shape         circle        5           50.0       
              triangle      3           30.0
              square        2           20.0
              total         10          100.0

size          small         2           33.3
              medium        2           33.3
              large         2           33.3
              total         6           100.0

But I am having trouble matching the names of my columns when I try to convert to long since they are long strings. From a previous question, I know I can do something like:


options(digits = 3)
df1 <- df2 %>% 
  pivot_longer(
    -ID,
    names_to = "Question",
    values_to = "Response"
  ) %>% 
  mutate(Question = str_extract(Question, '')) %>% 
  group_by(Question, Response) %>% 
  count(Response, name = "Freq") %>% 
  na.omit() %>% 
  group_by(Question) %>% 
  mutate(Percent = Freq/sum(Freq)*100) %>% 
  group_split() %>% 
  adorn_totals() %>% 
  bind_rows() %>% 
  mutate(Response = ifelse(Response == last(Response), last(Question), Response)) %>% 
  mutate(Question = ifelse(duplicated(Question) |
                             Question == "Total", NA, Question))

But I'm having trouble finding the right regular expression to put in the line:

 mutate(Question = str_extract(Question, '')) %>% 

If anyone knows another way to do this that would be great as well!

CodePudding user response:

If the intention is to extract a custom list of words, we could paste the elements together to create a single string and use that as regex in str_extract

library(dplyr)
library(tidyr)
library(janitor)
library(stringr)
library(flextable)

pat_words <- c("color", "shape", "size")
out <-  df %>% 
  pivot_longer(
    -ID,
    names_to = "Question",
    values_to = "Response"
  ) %>% mutate(Question = str_extract(Question, str_c(pat_words, collapse="|"))) %>% group_by(Question, Response) %>% 
  count(Response, name = "Freq") %>% 
  na.omit() %>% 
  group_by(Question) %>% 
  mutate(Percent = round(Freq/sum(Freq)*100, 2)) %>% 
  group_split() %>% 
  adorn_totals() %>% 
  bind_rows() %>% 
  mutate(Response = ifelse(Response == last(Response), last(Question), Response)) %>% 
  mutate(Question = ifelse(duplicated(Question) |
                             Question == "Total", NA, Question)) %>% 
  as.data.frame
flextable(out)

-output

enter image description here

data

df <- structure(list(ID = c(55L, 83L, 78L, 43L, 29L), `What color is this item?` = c("red", 
"blue", "red", "green", "yellow"), `What color is this item?_2` = c("blue", 
"yellow", "yellow", NA, "green"), `What is the shape of this item?` = c("circle", 
"circle", "square", "square", "circle"), `What is the shape of this item?_2` = c("triangle", 
NA, "circle", "circle", "triangle"), size = c("small", "large", 
"large", "small", "medium")), class = "data.frame", row.names = c(NA, 
-5L))

CodePudding user response:

First of all, you should use more proper names for coding.

names(dat)[2:5] <- paste0(rep(c('color.', 'shape.'), each=2), 1:2)

Now we easily may bring the data into long format.

dat_l <- reshape(dat, 2:5, direction='long', idvar='ID')

After that we could use the table() function in base R and its relatives,

vars <- names(dat_l)[c("size", "color", "shape")]
tbl <- lapply(vars, \(x) table(dat_l[, x]) |> 
                (\(Freq) cbind(Freq=addmargins(Freq), 
                               Percent=addmargins(proportions(Freq))*100))() |>
                round(2)) |> 
  setNames(vars)

to get a nice table for the console.

tbl
# $size
#        Freq Percent
# large     4      40
# medium    2      20
# small     4      40
# Sum      10     100
# 
# $color
#        Freq Percent
# blue      2   22.22
# green     2   22.22
# red       2   22.22
# yellow    3   33.33
# Sum       9  100.00
# 
# $shape
#          Freq Percent
# circle      5   55.56
# square      2   22.22
# triangle    2   22.22
# Sum         9  100.00

# [1] "R version 4.1.2 (2021-11-01)"

Data

dat <- structure(list(ID = c(55L, 83L, 78L, 43L, 29L), What.color.is.this.item. = c("red", 
"blue", "red", "green", "yellow"), What.color.is.this.item._2 = c("blue", 
"yellow", "yellow", NA, "green"), What.is.the.shape.of.this.item. = c("circle", 
"circle", "square", "square", "circle"), What.is.the.shape.of.this.item._2 = c("triangle", 
NA, "circle", "circle", "triangle"), size = c("small", "large", 
"large", "small", "medium")), class = "data.frame", row.names = c(NA, 
-5L))

CodePudding user response:

Its necessary to make assumptions about the contents of the columns (appl), i.e. give the important keywords.

Then create a dataframe based on the columns

appl <- sapply( c("color","shape","size"), function(x) grep(x, colnames(dat)) )

data.frame( do.call( rbind, sapply( seq_along(appl), function(x){
  tbl <- table(unlist( dat[,appl[[x]]] )); 
  rbind( cbind( Variable=names(appl[x]), Freq=tbl, Percent=round( tbl/sum(tbl)*100, digits=2 ) ), 
  cbind( Variable=names(appl[x]), sum(tbl), sum(tbl/sum(tbl)*100) ) ) }  ) ) )

         Variable Freq Percent
blue        color    2   22.22
green       color    2   22.22
red         color    2   22.22
yellow      color    3   33.33
X           color    9     100
circle      shape    5   55.56
square      shape    2   22.22
triangle    shape    2   22.22
X.1         shape    9     100
large        size    2      40
medium       size    1      20
small        size    2      40
X.2          size    5     100

Data

dat <- structure(list(ID = c(55L, 83L, 78L, 43L, 29L), What.color.is.this.item. = c("red", 
"blue", "red", "green", "yellow"), What.color.is.this.item._2 = c("blue", 
"yellow", "yellow", NA, "green"), What.is.the.shape.of.this.item. = c("circle", 
"circle", "square", "square", "circle"), What.is.the.shape.of.this.item._2 = c("triangle", 
NA, "circle", "circle", "triangle"), size = c("small", "large", 
"large", "small", "medium")), class = "data.frame", row.names = c(NA, 
-5L))
  •  Tags:  
  • r
  • Related