unique words by group-CodePudding

this is my example dataframe

# Sample data: each row holds a group label and a comma-separated string of
# words (note the irregular spacing around the commas in the second entry).
example <- data.frame(
  group = c("A", "B", "A", "A"),
  word = c("car", "sun ,sun, house", "car, house", "tree")
)

I would like to keep only the words that are unique both within a group and across groups.

So I would like to get this

group   word
A       car, tree
B       sun

I used aggregate and get this

# Collect the `word` entries of each group. paste0() is called without
# `collapse`, so it hands each group's strings back as they are instead of
# joining them into a single string.
aggregate(word ~ group , data = example,  FUN = paste0) 

  group                  word
1     A car, car, house, tree
2     B       sun ,sun, house

But now I need to keep only the unique values, and even this attempt does not work:

# Failing attempt: for every row, unlist the `word` cell, apply unique() to
# each resulting string separately, and assign the list back into that cell.
# lapply() returns one list element per string, so the replacement can hold
# more entries than the single row being assigned (see the error below).
# NOTE(review): `cluster` is not defined in this snippet - presumably it is the
# result of the aggregate() call above; confirm.
for (i in 1:nrow(cluster)) {cluster[i, ][["word"]] = lapply(unlist(cluster[i, ][["word"]]), unique)}

with

Error in `[[<-.data.frame`(`*tmp*`, "word", value = list("car", "car, house",  : 
  replacement has 3 rows, data has 1

CodePudding user response：

A base R option using `aggregate` + `subset` + `ave`, as shown below

# Keep only the words that occur in exactly one group and list them per group.
with(
  # Step 1: per group, split every comma-separated string into single words.
  # The pattern "\\s*,\\s*" consumes any whitespace on either side of a comma,
  # so an entry such as "sun ,sun, house" is split into "sun", "sun", "house"
  # (a pattern that requires a space after the comma would leave "sun ,sun"
  # in one piece). trimws() removes leading/trailing blanks of each entry.
  aggregate(
    word ~ .,
    example,
    function(x) {
      unlist(strsplit(trimws(x), "\\s*,\\s*"))
    }
  ),
  # Step 2: stack the per-group word vectors into long (values, ind) form,
  # drop duplicated (word, group) pairs, keep the words that are left in a
  # single group only, and gather the survivors per group.
  aggregate(
    . ~ ind,
    subset(
      unique(stack(setNames(word, group))),
      ave(seq_along(ind), values, FUN = length) == 1
    ),
    c
  )
)

gives

  ind    values
1   A car, tree
2   B       sun

CodePudding user response：

Here's a dplyr solution:

library(dplyr)
library(tidyr)
# One word per row -> drop repeats inside a group -> keep words that are left
# in exactly one group -> join each group's remaining words with ", ".
example |>
  tidyr::separate_rows(word) |>
  dplyr::distinct(group, word) |>
  dplyr::group_by(word) |>
  dplyr::filter(dplyr::n() == 1) |>
  dplyr::group_by(group) |>
  dplyr::summarise(word = toString(word))

output

  group word       
1 A     car, tree
2 B     sun

CodePudding user response：

In base R you can use strsplit to get the words, split them by group, and use unique to get the unique words per group. Then use table to count in how many groups each word occurs and keep only the words that occur once.

# Split every entry on commas, pool the words of each group and drop repeats
# within the group. "\\s*,\\s*" tolerates blanks on either side of the comma,
# so "sun ,sun, house" becomes "sun", "sun", "house"; trimws() strips blanks
# at the ends of each entry.
t1 <- lapply(
  split(strsplit(trimws(example$word), "\\s*,\\s*"), example$group),
  \(x) unique(unlist(x))
)
# Each word now appears at most once per group, so its count is the number of
# groups that contain it; keep the words found in a single group.
t2 <- table(unlist(t1))
t2 <- names(t2)[t2 == 1]
# Collapse the remaining words of each group into one comma-separated string.
t1 <- lapply(t1, \(x) paste(x[x %in% t2], collapse = ", "))
data.frame(group = names(t1), word = unlist(t1))
#  group      word
#A     A car, tree
#B     B       sun

Or another way starting with the already used aggregate in the question.

# Join all entries of a group into one comma-separated string.
t1 <- aggregate(word ~ group, data = example, FUN = toString)
# Split that string back into words and drop repeats within the group.
# "\\s*,\\s*" tolerates blanks on either side of the comma, so "sun ,sun" is
# split into "sun" and "sun" instead of being kept as one token.
t2 <- lapply(strsplit(trimws(t1$word), "\\s*,\\s*"), unique)
# Words are unique per group here, so a count of 1 means "in one group only".
t3 <- table(unlist(t2))
t3 <- names(t3)[t3 == 1]
# Store the result as a plain character column (one string per group) rather
# than a list column.
t1$word <- vapply(t2, \(x) toString(x[x %in% t3]), character(1))
t1
#  group      word
#1     A car, tree
#2     B       sun

And just for fun a Benchmark

library(bench)
library(dplyr)
library(tidyr)
library(tidyverse)

# Sample data used by the benchmarked expressions below.
example = data.frame(group = c("A", "B", "A", "A"), word = c("car", "sun ,sun, house", "car, house", "tree"))

# Time every answer on the sample data. check = FALSE switches off
# bench::mark's comparison of the results; the expressions below return
# differently shaped objects (different column names, list vs. character
# columns), so they would not compare as equal.
bench::mark(check = FALSE,
# Base R: strsplit + split by group + table; returns a data.frame of strings.
GKi = {t1 <- lapply(split(strsplit(example$word, "[, ] "), example$group),
               \(x) unique(unlist(x)))
t2 <- table(unlist(t1))
t2 <- names(t2)[t2 == 1]
t1 <- lapply(t1, \(x) paste(x[x %in% t2], collapse = ", "))
data.frame(group = names(t1), word=unlist(t1))},
# Base R starting from aggregate(); `word` ends up holding the lapply() result
# (a list) rather than collapsed strings.
GKi2 = {t1 <- aggregate(word ~ group , data = example,  FUN = toString)
t2 <- lapply(strsplit(t1$word, "[, ] "), unique)
t3 <- table(unlist(t2))
t3 <- names(t3)[t3 == 1]
t1$word <- lapply(t2, \(x) x[x %in% t3])
t1},
# Base R: aggregate + stack + ave; result columns are named `ind` and `values`.
# NOTE(review): this expression splits on ", " while GKi/GKi2 split on "[, ] ".
ThomasIsCoding = with(
  aggregate(
    word ~ .,
    example,
    function(x) {
      unlist(strsplit(x, ", "))
    }
  ),
  aggregate(
    . ~ ind,
    subset(
      unique(stack(setNames(word, group))),
      ave(seq_along(ind), values, FUN = length) == 1
    ),
    c
  )
),
# dplyr/tidyr pipeline.
Mael = {example %>% 
  separate_rows(word) %>% 
  distinct(group, word) %>% 
  group_by(word) %>% 
  filter(n() == 1) %>% 
  group_by(group) %>% 
    summarise(word = toString(word))},
# tidyverse with an explicit join. This expression also rebuilds `example`
# inside the braces, so the data.frame() call is part of what is timed, and
# the surrounding parentheses around each assignment are kept as written.
"Nir Graham" = {example <- data.frame(group = c("A", "B", "A", "A"),
                      word = c("car", "sun ,sun, house", "car, house", "tree"))

(sep_df <- separate_rows(example,word,sep = ",") |> mutate_all(trimws) |> distinct())

(uniq_df <- sep_df|> group_by(word) |> count() |> filter(n==1))

(result_df <- inner_join(sep_df,uniq_df) |> group_by(group) |> summarise(word=paste0(word,collapse=", ")))
}
)

Result

  expression          min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
  <bch:expr>     <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
1 GKi            445.13µs 486.26µs    1997.    16.03KB     6.15   974     3
2 GKi2           916.97µs 968.68µs    1023.      7.3KB     6.15   499     3
3 ThomasIsCoding   3.54ms   3.73ms     266.     8.19KB     8.45   126     4
4 Mael            16.07ms  16.48ms      60.1   60.04KB     6.68    27     3
5 Nir Graham      37.29ms  39.49ms      24.0   90.59KB     8.00     9     3

Judging by the median times, GKi is about 2 times faster than GKi2, 7 times faster than ThomasIsCoding, 30 times faster than Mael and 80 times faster than Nir Graham.

CodePudding user response：

library(tidyverse)

# Sample data: a group label plus a comma-separated string of words per row.
example <- data.frame(
  group = c("A", "B", "A", "A"),
  word = c("car", "sun ,sun, house", "car, house", "tree")
)

# One word per row, surrounding whitespace trimmed, duplicated
# (group, word) pairs removed. across() replaces the superseded mutate_all().
(sep_df <- example |>
  separate_rows(word, sep = ",") |>
  mutate(across(everything(), trimws)) |>
  distinct())

# Words that are left in exactly one group. count(word) returns an ungrouped
# table, so no grouping leaks into later steps.
(uniq_df <- sep_df |>
  count(word) |>
  filter(n == 1))

# Keep only those words and collapse them into one string per group. The join
# key is spelled out so the join does not rely on (and report) a guessed key.
(result_df <- sep_df |>
  inner_join(uniq_df, by = "word") |>
  group_by(group) |>
  summarise(word = paste0(word, collapse = ", ")))