this is my example dataframe
example = data.frame(group = c("A", "B", "A", "A"), word = c("car", "sun ,sun, house", "car, house", "tree"))
I would like to get only unique words within group and through groups
So I would like to get this
group word
A car, tree
B sun
I used aggregate and get this
aggregate(word ~ group , data = example, FUN = paste0)
group word
1 A car, car, house, tree
2 B sun ,sun, house
but now i need to select only unique values, but even this does not work out
for (i in 1:nrow(cluster)) {cluster[i, ][["word"]] = lapply(unlist(cluster[i, ][["word"]]), unique)}
with
Error in `[[<-.data.frame`(`*tmp*`, "word", value = list("car", "car, house", :
replacement has 3 rows, data has 1
CodePudding user response:
A base R option using aggregate
subset
ave
like below
with(
aggregate(
word ~ .,
example,
function(x) {
unlist(strsplit(x, "[, ] "))
}
),
aggregate(
. ~ ind,
subset(
unique(stack(setNames(word, group))),
ave(seq_along(ind), values, FUN = length) == 1
),
c
)
)
gives
ind values
1 A car, tree
2 B sun
CodePudding user response:
Here's a dplyr
solution:
library(dplyr)
library(tidyr)
example %>%
separate_rows(word) %>%
distinct(group, word) %>%
group_by(word) %>%
filter(n() == 1) %>%
group_by(group) %>%
summarise(word = toString(word))
output
group word
1 A car, tree
2 B sun
CodePudding user response:
In base you can use strsplit
to get the words, split
them by group and use unique
the get unique words per group. Use table
to get the number of same words and take those which appear only once.
t1 <- lapply(split(strsplit(example$word, "[, ] "), example$group),
\(x) unique(unlist(x)))
t2 <- table(unlist(t1))
t2 <- names(t2)[t2 == 1]
t1 <- lapply(t1, \(x) paste(x[x %in% t2], collapse = ", "))
data.frame(group = names(t1), word=unlist(t1))
# group word
#A A car, tree
#B B sun
Or another way starting with the already used aggregate
in the question.
t1 <- aggregate(word ~ group , data = example, FUN = toString)
t2 <- lapply(strsplit(t1$word, "[, ] "), unique)
t3 <- table(unlist(t2))
t3 <- names(t3)[t3 == 1]
t1$word <- lapply(t2, \(x) x[x %in% t3])
t1
# group word
#1 A car, tree
#2 B sun
And just for fun a Benchmark
library(bench)
library(dplyr)
library(tidyr)
library(tidyverse)
example = data.frame(group = c("A", "B", "A", "A"), word = c("car", "sun ,sun, house", "car, house", "tree"))
bench::mark(check = FALSE,
GKi = {t1 <- lapply(split(strsplit(example$word, "[, ] "), example$group),
\(x) unique(unlist(x)))
t2 <- table(unlist(t1))
t2 <- names(t2)[t2 == 1]
t1 <- lapply(t1, \(x) paste(x[x %in% t2], collapse = ", "))
data.frame(group = names(t1), word=unlist(t1))},
GKi2 = {t1 <- aggregate(word ~ group , data = example, FUN = toString)
t2 <- lapply(strsplit(t1$word, "[, ] "), unique)
t3 <- table(unlist(t2))
t3 <- names(t3)[t3 == 1]
t1$word <- lapply(t2, \(x) x[x %in% t3])
t1},
ThomasIsCoding = with(
aggregate(
word ~ .,
example,
function(x) {
unlist(strsplit(x, ", "))
}
),
aggregate(
. ~ ind,
subset(
unique(stack(setNames(word, group))),
ave(seq_along(ind), values, FUN = length) == 1
),
c
)
),
Mael = {example %>%
separate_rows(word) %>%
distinct(group, word) %>%
group_by(word) %>%
filter(n() == 1) %>%
group_by(group) %>%
summarise(word = toString(word))},
"Nir Graham" = {example <- data.frame(group = c("A", "B", "A", "A"),
word = c("car", "sun ,sun, house", "car, house", "tree"))
(sep_df <- separate_rows(example,word,sep = ",") |> mutate_all(trimws) |> distinct())
(uniq_df <- sep_df|> group_by(word) |> count() |> filter(n==1))
(result_df <- inner_join(sep_df,uniq_df) |> group_by(group) |> summarise(word=paste0(word,collapse=", ")))
}
)
Result
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
1 GKi 445.13µs 486.26µs 1997. 16.03KB 6.15 974 3
2 GKi2 916.97µs 968.68µs 1023. 7.3KB 6.15 499 3
3 ThomasIsCoding 3.54ms 3.73ms 266. 8.19KB 8.45 126 4
4 Mael 16.07ms 16.48ms 60.1 60.04KB 6.68 27 3
5 Nir Graham 37.29ms 39.49ms 24.0 90.59KB 8.00 9 3
GKi is about 2 times faster than GKi2, 7 times faster than ThomasIsCoding, 30 than Mael and 80 than Nir Graham.
CodePudding user response:
library(tidyverse)
example <- data.frame(group = c("A", "B", "A", "A"),
word = c("car", "sun ,sun, house", "car, house", "tree"))
(sep_df <- separate_rows(example,word,sep = ",") |> mutate_all(trimws) |> distinct())
(uniq_df <- sep_df|> group_by(word) |> count() |> filter(n==1))
(result_df <- inner_join(sep_df,uniq_df) |> group_by(group) |> summarise(word=paste0(word,collapse=", ")))