I have a dataframe such as :
ANIMALS query
3 A
4 [CAT], [DOG] B
5 C
6 [CAT] D
7 E
8 [CAT], [SHARK], [DOG] F
and I would like to get a summary of proportion of numbers list elements between in ANIMALS
Summary expected :
ANIMALS Freq
CAT 0.50
DOG 0.33
SHARK 0.17
Here is tha dataframe in dput format if it can helps:
structure(list(ANIMALS = list(character(0),
c("[CAT]", "[DOG]"), character(0), "[CAT]",
character(0), c("[CAT]", "[SHARK]", "[DOG]"
)), query = c("A",
"B", "C",
"D", "E",
"F")), row.names = 3:8, class = "data.frame")
So far I tried to use the traditional table(tab$ANIMALS)
but it does not work.
CodePudding user response:
Here is dplyr and stringr solution:
library(dplyr)
library(stringr)
unlist(dat$ANIMALS) %>%
str_extract(., "[A-Z] " ) %>%
data.frame(ANIMALS= .) %>%
group_by(ANIMALS) %>%
summarise(total = n() ) %>%
mutate(Freq = total/sum(total))
CodePudding user response:
baseR solution
animal <- unlist(Filter(length, mydata$ANIMALS))
as.data.frame(table(animal)/length(animal))
# animal Freq
# 1 [CAT] 0.5000000
# 2 [DOG] 0.3333333
# 3 [SHARK] 0.1666667
CodePudding user response:
Another approach:
library(dplyr)
library(tidyr)
library(stringr)
df %>% select(ANIMALS) %>% unnest(ANIMALS) %>%
mutate(ANIMALS = str_remove_all(ANIMALS, '\\[|\\]'), l = n()) %>%
group_by(ANIMALS) %>% summarise(Freq = n()/l) %>% distinct()
`summarise()` has grouped output by 'ANIMALS'. You can override using the `.groups` argument.
# A tibble: 3 x 2
# Groups: ANIMALS [3]
ANIMALS Freq
<chr> <dbl>
1 CAT 0.5
2 DOG 0.333
3 SHARK 0.167
CodePudding user response:
We may also do
library(dplyr)
library(tidyr)
library(stringr)
df1 %>%
filter(lengths(ANIMALS) > 0) %>%
unnest(ANIMALS) %>%
count(ANIMALS = str_remove_all(ANIMALS, "\\[|\\]"), name = 'Freq') %>%
mutate(Freq = Freq/sum(Freq))
-output
# A tibble: 3 × 2
ANIMALS Freq
<chr> <dbl>
1 CAT 0.5
2 DOG 0.333
3 SHARK 0.167
Or in base R
proportions(table(unlist(df1$ANIMALS)))
[CAT] [DOG] [SHARK]
0.5000000 0.3333333 0.1666667