r -- apply method for character counting for a frequency-weighted wordlist-CodePudding

Ok, I have a list of words with their frequencies. There are many, many thousands of these. Here's a mini example:

# Toy word list: `w` holds the words, `f` how often each one occurs.
w <- c("abandon", "break", "fuzz", "when")
f <- c(2, 10, 8, 200)
# Hand the columns to data.frame() directly. cbind() on a character and a
# numeric vector builds a character matrix, which would turn `f` into text
# (a factor on R < 4.0) instead of keeping it numeric.
df <- data.frame(w, f, stringsAsFactors = FALSE)
df

        w   f
1 abandon   2
2   break  10
3    fuzz   8
4    when 200

What I want to do is count the characters in each word and then aggregate the results. The count_chars function from the ds4psy package can do this for a given vector of strings. I've done this successfully by just creating a giant vector of strings from the word list (which has 10s of 1000s of words), as follows:

library(ds4psy) # for count_chars function 
library(dplyr)

# Word list with frequencies: `w` as character, `f` as integer.
# The columns are passed to data.frame() directly. Going through cbind()
# first coerces `f` to character; on R < 4.0 it then becomes a factor and
# as.integer() returns the factor level codes, not the frequencies.
w <- c("abandon", "break", "fuzz", "when")
f <- c(2, 10, 8, 200)
df <- data.frame(
  w = as.character(w),
  f = as.integer(f),
  stringsAsFactors = FALSE
)

# repword: glue `frq` copies of `wrd` together into one string, with
# nothing in between the copies
repword <- function(frq, wrd) {
  copies <- rep(wrd, times = frq)
  paste0(copies, collapse = "")
}

# Build the text to count: one string per word, each word repeated as many
# times as its frequency.
# CAUTION -- this needs a lot of memory with 10s of 1000s of words
mytext <- paste(mapply(repword, df$f, df$w))

# tabulate the letters of the whole text
mycounts <- count_chars(mytext)

# turn the table into a data frame, rows in order of the character names
mycounts.df <- as.data.frame(mycounts[order(names(mycounts))])

# display the counts, largest Freq first
arrange(mycounts.df, desc(Freq))

However, a colleague does not have enough memory for this brute force solution. So I tried to figure out how to do this word-by-word using foreach or mapply, but I am really stuck.

One issue is that you need a vector that has every letter in it to combine them (so far as I can tell). So I create a dummy word with all letters in it, and then do some tweaks to keep it from counting the repeated letters each time.

# dummy string made of the letters a-z, each exactly once
dummy <- paste(letters, collapse = "")
# count of the dummy string (all 1s); it is subtracted again after every
# per-word count
dummycount <- count_chars(dummy)


# Count the letters of one word, weighted by the word's frequency.
#
# frq: how often the word occurs; every letter count is multiplied by it.
# wrd: the word to count, a single string.
#
# Returns a table with one entry per letter in `dummycount`, in the same
# order as `dummycount`. Characters that are not in `dummycount` are left
# out of the result.
#
# Uses `dummy` and `dummycount` from the enclosing environment.
countword <- function(frq, wrd) {
  # prepend the dummy string so that every letter shows up in the count
  myword <- paste0(dummy, wrd, collapse = "")
  # Pick the letters by name before subtracting: arithmetic on tables is
  # positional, so the result only lines up with `dummycount` (and with any
  # running total built from it) if it is in the same letter order, whatever
  # order count_chars() returns its counts in.
  mycount <- count_chars(myword)[names(dummycount)] - dummycount
  mycount * frq # weight by frequency
}

totalcount <- dummycount - 1 # running total, every letter starts at zero

# Add up the weighted counts one word at a time. The body assigns to
# `totalcount` on each pass, so the total is available after the loop.
foreach(frq = df$f, wrd = df$w) %do% {
  tmp <- countword(frq, wrd)
  # Add by letter name rather than by position: the per-word counts are not
  # necessarily in the same letter order as `totalcount`.
  totalcount[names(tmp)] <- totalcount[names(tmp)] + tmp
}

But this just doesn't work ... I get a weird result:


> totalcount
chars
 a  b  c  d  e  f  g  h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z 
16 12 10  6  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0

I would be very grateful for any advice!

CodePudding user response：

Can you simply multiply the output of count_chars() by f, and do this by row?

library(data.table)
# Per row: letter counts of the word times its frequency; then sum the
# counts per letter and list the letters from most to least frequent.
setDT(df)[
  , data.table(count_chars(w) * f), by = 1:nrow(df)
][
  , .(ct = sum(N)), by = chars
][
  order(-ct)
]

Output:

    chars  ct
 1:     e 210
 2:     n 204
 3:     h 200
 4:     w 200
 5:     z  16
 6:     a  14
 7:     b  12
 8:     k  10
 9:     r  10
10:     f   8
11:     u   8
12:     d   2
13:     o   2

CodePudding user response：

Trying to reply to @langtang but can't format the answer...

This seems terrific, but does not quite work for me. Here's a slightly tweaked df (including 2 homograph entries, as break could be a noun or a verb), and you'll see the output is not as expected, but I can't figure out what I've done wrong!

library(ds4psy) # for count_chars function 
library(data.table)

# Word list with a repeated word ("break" is listed twice) and the
# frequencies. The columns are passed to data.frame() directly. Building the
# frame from cbind(w, f) coerces `f` to character; on R < 4.0 it becomes a
# factor, and as.integer() then yields the factor level codes (here 3, 1, 4,
# 5, 2) instead of 222, 10, 3, 8, 200, which distorts every weighted count.
w <- c("abandon", "break", "break", "fuzz", "when")
f <- c(222, 10, 3, 8, 200)
df <- data.frame(
  w = as.character(w),
  f = as.integer(f),
  stringsAsFactors = FALSE
)
# Per row: letter counts of the word times its frequency; then sum the
# counts per letter and list the letters from most to least frequent.
setDT(df)[
  , data.table(count_chars(w) * f), by = 1:nrow(df)
][
  , .(ct = sum(N)), by = chars
][
  order(-ct)
]

Output:

    chars ct
 1:     a 11
 2:     z 10
 3:     n  8
 4:     b  8
 5:     e  7
 6:     k  5
 7:     r  5
 8:     f  5
 ...

CodePudding user response：

If we want the same output with foreach (assuming OP wants to work with foreach), simply loop over the sequence of rows

library(foreach)
library(parallel)
library(doSNOW)
# Start one socket worker per detected core and register it with foreach.
no_of_cores <- detectCores()
cl <- makeSOCKcluster(no_of_cores)
registerDoSNOW(cl)
# One iteration per row of `df`. Each iteration returns a full count vector
# holding that row's weighted letter counts, and `+` adds those vectors
# element-wise into the overall total.
# seq_len() is used instead of 1:nrow(df) so an empty `df` gives no
# iterations rather than the indices 1 and 0.
out <- foreach(i = seq_len(nrow(df)), .export = "count_chars",
               .combine = `+`) %dopar% {
  tmp <- countword(df$f[i], df$w[i])
  # add by letter name so each count lands on the right letter
  totalcount[names(tmp)] <- totalcount[names(tmp)] + tmp
  totalcount
}
stopCluster(cl)

-output

> out
  a   b   c   d   e   f   g   h   i   j   k   l   m   n   o   p   q   r   s   t   u   v   w   x   y   z 
 14  12   0   2 210   8   0 200   0   0  10   0   0 204   2   0   0  10   0   0   8   0 200   0   0  16