I have a data frame that looks like this :
id | A | B | C |
---|---|---|---|
1 | NA | dog | NA |
2 | NA | NA | NA |
3 | cat | dog | cat |
4 | NA | NA | NA |
5 | cat | dog | NA |
6 | NA | dog | dog |
7 | NA | dog | NA |
8 | NA | dog | NA |
9 | cat | cat | cat |
10 | dog | cat | dog |
I want to calculate how many counts of each variable cat and dog appear in each column and calculate their percentage BUT WITHOUT specifying to search the specific names. Ideally I want the output to be 3 tables that must look like this:
A | per | |
---|---|---|
cat | 3 | 3/4 |
dog | 1 | 1/4 |
B | per | |
---|---|---|
cat | 2 | 2/6 |
dog | 4 | 4/6 |
C | per | |
---|---|---|
cat | 2 | 2/4 |
dog | 2 | 2/4 |
How I can do that using dplyr?
id = seq(1:10)
A = c("NA","NA","cat","NA","cat","NA","NA","NA","cat","dog")
B = c("dog","NA","dog","NA","cat","dog","dog","dog","cat","cat")
C = c("NA","NA","cat","NA","NA","dog","NA","NA","cat","dog")
frame = tibble(id,A,B,C);frame
CodePudding user response:
It's bit messy
library(dplyr)
library(reshape2)
frame2 <- frame %>%
reshape2::melt(id.vars = "id") %>%
filter(value != "NA") # or na.omit()
frame2
res <- list()
l <- unique(frame2$variable)
for (i in 1:length(l)){
dd <- frame2 %>%
filter(variable == l[i]) %>%
group_by(variable, value) %>%
summarise(n = n()) %>%
mutate(freq = n/sum(n)) %>%
pivot_wider(id_cols = value, values_from = c(n, freq), names_from = variable) %>%
column_to_rownames(var = "value")
colnames(dd) <- c(as.character(l[i]), "per")
res[[i]] <- dd
}
res
[[1]]
A per
cat 3 0.75
dog 1 0.25
[[2]]
B per
cat 3 0.375
dog 5 0.625
[[3]]
C per
cat 2 0.5
dog 2 0.5
CodePudding user response:
You can do:
library(tidyverse)
frame %>% pivot_longer(!id, names_to = "group", values_to = "animal") %>%
mutate(animal = na_if(animal, "NA")) %>%
drop_na() %>%
group_by(group, animal) %>% summarise(n = n()) %>%
group_by(group) %>% mutate(per = n / sum(n))
#> `summarise()` has grouped output by 'group'. You can override using the
#> `.groups` argument.
#> # A tibble: 6 × 4
#> # Groups: group [3]
#> group animal n per
#> <chr> <chr> <int> <dbl>
#> 1 A cat 3 0.75
#> 2 A dog 1 0.25
#> 3 B cat 3 0.375
#> 4 B dog 5 0.625
#> 5 C cat 2 0.5
#> 6 C dog 2 0.5
or if you really want three separate tables, you can do
frame %>% pivot_longer(!id, names_to = "group", values_to = "animal") %>%
mutate(animal = na_if(animal, "NA")) %>%
drop_na() %>%
group_by(group, animal) %>% summarise(n = n()) %>%
group_by(group) %>% mutate(per = n / sum(n)) %>%
group_split() %>%
map(~ pivot_wider(.x, names_from = "group", values_from = "n") %>%
relocate(per, .after = last_col()))
#> `summarise()` has grouped output by 'group'. You can override using the
#> `.groups` argument.
#> [[1]]
#> # A tibble: 2 × 3
#> animal A per
#> <chr> <int> <dbl>
#> 1 cat 3 0.75
#> 2 dog 1 0.25
#>
#> [[2]]
#> # A tibble: 2 × 3
#> animal B per
#> <chr> <int> <dbl>
#> 1 cat 3 0.375
#> 2 dog 5 0.625
#>
#> [[3]]
#> # A tibble: 2 × 3
#> animal C per
#> <chr> <int> <dbl>
#> 1 cat 2 0.5
#> 2 dog 2 0.5
Created on 2022-06-17 by the reprex package (v2.0.1)