Count Unique Word Matches in Column-CodePudding

I am interested in counting the number of unique matches in a column against a list of words. I want the count to be in a new column in the dataframe, so that each row has its own count.

For example:

# Sample data: one row per person, with the grocery list stored as a single
# space-separated string.
person_id <- sprintf("%03d", 1:3)
grocery_list <- c(
  "apple orange orange kiwi",
  "eggs milk apple apple",
  "apple orange banana"
)

# Words to look for in each grocery list.
fruit_list <- c("apple", "orange", "banana")

df <- data.frame(person_id = person_id, grocery_list = grocery_list)

The output would be:
person_id grocery_list                   fruit_count
001       apple orange orange kiwi       2
002       eggs milk apple apple          1
003       apple orange banana            3

CodePudding user response：

This should do it:

library(tidyverse)

# Example data: one space-separated grocery list per person.
person_id <- c("001", "002", "003")
grocery_list <- c("apple orange orange kiwi", "eggs milk apple apple", "apple orange banana")

df <- data.frame(person_id, grocery_list)

fruit_list <- c("apple", "orange", "banana")

# Wrap each fruit in word boundaries so that only whole words match; a bare
# "apple" pattern would also be detected inside "pineapple".
fruit_patterns <- str_c("\\b", fruit_list, "\\b")

df %>%
  rowwise() %>%
  # Within a row, grocery_list is a single string that is tested against every
  # pattern; summing the logical results counts each fruit at most once.
  mutate(fruit_count = sum(str_detect(grocery_list, fruit_patterns))) %>%
  # Drop the row-wise grouping so later verbs operate on the whole table.
  ungroup()
#> # A tibble: 3 × 3
#>   person_id grocery_list             fruit_count
#>   <chr>     <chr>                          <int>
#> 1 001       apple orange orange kiwi           2
#> 2 002       eggs milk apple apple              1
#> 3 003       apple orange banana                3

Created on 2022-06-03 by the reprex package (v2.0.1)

CodePudding user response：

You can do something like this:

# Split every grocery list into its words once, then count how many entries of
# fruit_list occur among the words of each row. `%in%` is evaluated per fruit,
# so repeated words in a list are counted only once.
# vapply() pins the result to one integer per row, so the column type does not
# depend on the input the way sapply()'s simplification does.
df$fruit_count <- vapply(
  strsplit(df$grocery_list, " ", fixed = TRUE),
  \(words) sum(fruit_list %in% words),
  integer(1)
)

Output:

  person_id             grocery_list fruit_count
1       001 apple orange orange kiwi           2
2       002    eggs milk apple apple           1
3       003      apple orange banana           3

CodePudding user response：

In base R, you can use a vectorized version of grepl and then take the rowSums:

# Whole-word patterns: a bare "apple" would also match inside "pineapple".
fruit_patterns <- paste0("\\b", fruit_list, "\\b")

# Vectorize() over `pattern` runs grepl() once per fruit against the whole
# column. matrix() forces a rows-by-fruits layout, because for a single-row df
# the result is simplified to a plain vector, which rowSums() rejects.
fruit_hits <- matrix(
  Vectorize(grepl, "pattern")(fruit_patterns, df$grocery_list),
  nrow = nrow(df)
)

# Each row sum is the number of distinct fruits found in that grocery list.
df$fruit_count <- rowSums(fruit_hits)
df
  person_id             grocery_list fruit_count
1       001 apple orange orange kiwi           2
2       002    eggs milk apple apple           1
3       003      apple orange banana           3

CodePudding user response：

Try

# Add fruit_count: the number of distinct fruits from fruit_list that occur as
# whole words in each row's grocery_list (resolved as a column of df).
transform(
    df,
    fruit_count = rowSums(matrix(
        # One logical column per fruit; the word boundaries keep "apple" from
        # matching inside words such as "pineapple".
        vapply(
            paste0("\\b", fruit_list, "\\b"),
            grepl,
            logical(length(grocery_list)),
            x = grocery_list
        ),
        # Keep a rows-by-fruits matrix even when df has a single row, where
        # the per-fruit results would otherwise collapse to a plain vector.
        nrow = length(grocery_list)
    ))
)

which gives

  person_id             grocery_list fruit_count
1       001 apple orange orange kiwi           2
2       002    eggs milk apple apple           1
3       003      apple orange banana           3

CodePudding user response：

With the help of @akrun (see "Shortest way to remove duplicate words from string"), here is a solution with str_count:

library(dplyr)
library(stringr)

df %>%
  rowwise() %>%
  mutate(
    count = str_count(
      # Reduce the row's grocery list to one copy of each word, so repeated
      # words are counted only once.
      paste(unique(unlist(strsplit(grocery_list, " "))), collapse = " "),
      # Alternation of all fruits, anchored at word boundaries so that a
      # fruit name inside a longer word (e.g. "pineapple") is not counted.
      paste0("\\b(", paste(fruit_list, collapse = "|"), ")\\b")
    )
  ) %>%
  # Drop the row-wise grouping so later verbs operate on the whole table.
  ungroup()

  person_id grocery_list             count
  <chr>     <chr>                    <int>
1 001       apple orange orange kiwi     2
2 002       eggs milk apple apple        1
3 003       apple orange banana          3

CodePudding user response：

You can count the gregexpr hits. (?!.*\\b\\1\\b) is a negative lookahead which checks that the word captured by (apple|orange|banana) does not occur again later in the string, so only the last occurrence of each word produces a hit.

# Whole-word alternation of the fruits, followed by a negative lookahead that
# rejects a hit when the same captured word occurs again later in the string;
# each distinct fruit therefore matches exactly once (at its last occurrence).
fruit_pattern <- paste0(
  "\\b(", paste(fruit_list, collapse = "|"), ")\\b\\s*(?!.*\\b\\1\\b)"
)
fruit_hits <- gregexpr(fruit_pattern, df$grocery_list, perl = TRUE)

# gregexpr() reports "no match" as a single -1, so lengths() would give 1 for a
# grocery list without any fruit. Count only real (positive) match positions.
df$fruit_count <- vapply(fruit_hits, \(hit) sum(hit > 0L), integer(1))

df
#  person_id             grocery_list fruit_count
#1       001 apple orange orange kiwi           2
#2       002    eggs milk apple apple           1
#3       003      apple orange banana           3

Just for fun a benchmark!

# Benchmark of the six answers on the three-row example data.

# Rebuild the example data so the benchmark can be run on its own.
person_id <- c("001", "002", "003")
grocery_list <- c("apple orange orange kiwi", "eggs milk apple apple", "apple orange banana")
df <- data.frame(person_id, grocery_list)
fruit_list <- c("apple", "orange", "banana") 

# Only the %>% pipe is attached; dplyr and stringr functions are called
# through `::` inside the candidates.
library(magrittr)

# Each candidate is an anonymous function that is called immediately with df,
# so any assignment to df inside it stays local to that call.
# check = FALSE: the candidates do not all return the same object (for example
# TarJae names its column `count`, and langtang's function ends with the
# assignment instead of df), so the results are not compared.
bench::mark(check = FALSE,
 # rowwise() + str_detect() of the row's string against every fruit.
 DaveArmstrong = (\(df) {df %>% 
               dplyr::rowwise() %>% 
               dplyr::mutate(fruit_count = sum(stringr::str_detect(grocery_list, fruit_list)))})(df),
 # grepl() vectorized over the pattern, then rowSums().
 onyambu = (\(df) {df$fruit_count <- rowSums(Vectorize(grepl, 'pattern')(fruit_list, df$grocery_list))
   df})(df),
 # Split each string on spaces and test the fruits with %in%.
 langtang = (\(df) {df["fruit_count"] = sapply(df$grocery_list, \(s) sum(fruit_list %in% strsplit(s," ")[[1]]))})(df),
 # transform() with sapply() over the fruits, then rowSums().
 ThomasIsCoding = (\(df) {transform(
    df,
    fruit_count = rowSums(sapply(fruit_list, grepl, grocery_list))
    )})(df),
 # rowwise() + str_count() on the de-duplicated words of each row.
 TarJae = (\(df) {df %>% 
  dplyr::rowwise() %>% 
  dplyr::mutate(count = stringr::str_count(paste(unique(unlist(strsplit(grocery_list, " "))), collapse = " ") , paste(fruit_list, collapse = "|")))
})(df),
 # A single perl regex with a negative lookahead, counted via lengths(gregexpr()).
 GKi = (\(df) {df$fruit_count <- lengths(gregexpr(paste0("\\b(", paste(fruit_list
 , collapse="|"), ")\\b\\s*(?!.*\\b\\1\\b)"), df$grocery_list, perl=TRUE))
df})(df)
)

Result

  expression          min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
  <bch:expr>     <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
1 DaveArmstrong       2ms   2.08ms      417.     6.9KB     17.0   172     7
2 onyambu         73.18µs  77.33µs    11668.        0B     25.1  5571    12
3 langtang        44.89µs  48.59µs    19261.        0B     23.2  9146    11
4 ThomasIsCoding 102.82µs 112.02µs     7055.        0B     18.7  3391     9
5 TarJae           2.06ms   2.12ms      412.     6.9KB     17.2   192     8
6 GKi             17.97µs  19.64µs    47069.    48.6KB     42.4  9991     9

In this case, GKi is more than 2 times faster than langtang, which is followed by onyambu and ThomasIsCoding. DaveArmstrong and TarJae are about 100 times slower than the fastest solution.