Find overlapping letters in words-CodePudding

I have a string with only three words like this:

first_string <- c("self", "funny", "nymph")

As you can see the words of this vector can all be put together to one word because there is some overlap in letters, i.e. we get selfunnymph. Let`s call this a word train.

Besides, I have another vector with many words. Let the second vector be:

second_string <- c("house", "garden", "duck", "evil", "fluff")

I want to know what words of the second string can be added to the word train. In this case this is house and fluff (house can be added in the end of selfunnymph and fluff can be put between self and funny). So the expected output here would be:

expected <- data.frame(word= c("house", "fluff"), word_train= c("selfunnymphouse", "selfluffunnymph"))

The overlap can be of any length, i.e. self and funny overlap only with one character but funny and nymph overlap in two characters.

EDIT

The new word can change the word order of the first word train. For example, if the second vector contains the word hugs we can make the word train nymphugselfunny, which puts nymph before self and funny.

CodePudding user response：

It turned out to be much harded than I thought but this is what I ended up doing:

Make an matrix with the first n letters of each word and another matrix with last n letters of each word
Comparing the two matrices shows which words overlap
Paste overlaping words to a word train
Repeating the steps above until there is no new overlap

Running the code for my data from question gave me such long word trains as I did not expect while writing the question, with the longest word trains being gardenymphouselfluffunny and selfluffunnymphousevil (both contain 6 words). The output data is:

                               wagons                    train
fluffunnymphouself       fluff, f....       fluffunnymphouself
funnymphouselfluff       funny, n....       funnymphouselfluff
gardenymphouselfluffunny garden, .... gardenymphouselfluffunny
gardenymphouselfunny     garden, ....     gardenymphouselfunny
houselfluffunnymph       house, s....       houselfluffunnymph
houselfunnymph           house, s....           houselfunnymph
selfluffunnymphousevil   fluff, f....   selfluffunnymphousevil
selfunnymphousevil       funny, n....       selfunnymphousevil
# The column wagons is a list of different length, depending on the words that are in the word train.

The code is quite long though..

# Vectors from question.
first_string <- c("self", "funny", "nymph")
second_string <- c("house", "garden", "duck", "evil", "fluff")

# Prepating the while loop which only runs while there are any new_wagons to add to the train.
all_wagons <- tolower(c(first_string, second_string))
new_wagons <- TRUE
results <- data.frame(wagons= I(list("")), train= "")

# Start the while loop.
while(any(new_wagons, na.rm= TRUE)){
# Going though every train that has been made so far..
  all_results <- by(results, list(results$train), function(train_i){
# What wagons have been used for this train?
    used_wagons <- unique(unlist(train_i[ , "wagons"]))
    used_wagons <- used_wagons[used_wagons != ""]
# What wagons can be used to extend the train?
    wagons_to_use_from <- unique(c(all_wagons[!all_wagons %in% used_wagons], train_i[ , "train"]))

# Get the first n letters of every word.
    wagon_start <- as.data.frame(sapply(wagons_to_use_from, function(wagon_i){
      sapply(1:max(nchar(wagons_to_use_from)), function(length_i){
        substr(wagon_i, 1, length_i)
      })}))
# Get the last n letters of every word.
    wagon_end <- as.data.frame(sapply(wagons_to_use_from, function(wagon_i){
      sapply(0:(max(nchar(wagons_to_use_from)-1)), function(length_i){
        substr(wagon_i, nchar(wagon_i)-length_i, nchar(wagon_i))
      })}))
# Find the overlap in letters.
    find_overlap <- data.frame(word= rep(names(wagon_end), each= nrow(wagon_end)))
    find_overlap$word_end <- unlist(wagon_end[ , unique(find_overlap$word)])
    find_overlap$without_word <- wagon_start[rep(1:nrow(wagon_start), ncol(wagon_end)), , drop= FALSE]
    find_overlap$without_word[matrix(c(1:nrow(find_overlap),
                                   rep(1:ncol(wagon_start), each= nrow(wagon_end))),
                                 ncol= 2)] <- NA
    new_wagons <- find_overlap$word_end == find_overlap$without_word

# If there is no new overlap then return the data as it was.
    if(!any(new_wagons, na.rm= TRUE)){
      results <- train_i
    } else{
# If there is an overlap then save the relevant words.
      word_i <- find_overlap$word[sort(which(new_wagons == TRUE, arr.ind = TRUE)[ , "row"])]
      word_overlap <- find_overlap$word_end[sort(which(new_wagons == TRUE, arr.ind = TRUE)[ , "row"])]
      word_after_i <- colnames(new_wagons)[which(t(new_wagons) == TRUE, arr.ind = TRUE)[, "row"]]
  
      word_trains <- data.frame(word_i, word_overlap, word_after_i, word_train= paste0(substr(word_i, 1, nchar(word_i)- nchar(word_overlap)),
                                                                                   word_after_i))
# Avoid former word trains as wagon names for next round:
      if(train_i$train != ""){
        word_trains <- word_trains[word_trains$word_i == train_i$train | word_trains$word_after_i == train_i$train, ]
      }
  # Output results where the former and new used words as well as the word train is.
      results <- do.call("rbind.data.frame", lapply(as.data.frame(t(word_trains)), function(word_trains_i){
        used_wagons_old <- used_wagons
        used_wagons_new <- c(word_trains_i[1], word_trains_i[3])
        wagons <- c(used_wagons_old, used_wagons_new)
        wagons <- wagons[wagons != train_i$train]
        wagons <- wagons[wagons != ""]
    
    
        data.frame(wagons= I(list(wagons)),
                   train= word_trains_i[4]
               
        )
    
      }))
    }


    list(results, new_wagons)

  })
# Make two dataframes, one with the word results, one with logicals whether there is any overlap.
  results <- do.call(rbind, lapply(all_results, `[[`, 1))
  results <- results[!duplicated(results$train), ]
  new_wagons <- unlist(do.call(list, lapply(all_results, `[[`, 2)))

}

CodePudding user response：

I'm wondering why you asked this, but it was a fun exercise regardless. Here's my implementation:

library('dplyr')


# define cars -------------------------------------------------------------

original_cars <- c("self", "funny", "nymph")
new_cars <- c("house", "garden", "duck", "evil", "fluff")
cars <- c(original_cars, new_cars)


# get all possible connections ('parts') per car --------------------------

car_parts <- lapply(seq_along(cars), \(car_id) {
  
  car = cars[car_id]
  n = nchar(car)
  
  ids <- rep(car_id, n)
  names <- rep(car, n)
  left <- vapply(seq_len(n), \(i) substr(car, 1, i), "")
  right <- vapply(seq_len(n), \(i) substr(car, n-i 1, n), "")
  overlap <- nchar(left)
  
  data.frame(car.id = ids, car.name = names, left = left, right = right, overlap = overlap)
  
}) |> do.call(rbind, args=_)

# > car_parts
#    car.id car.name   left  right overlap
# 1       1     self      s      f       1
# 2       1     self     se     lf       2
# 3       1     self    sel    elf       3
# 4       1     self   self   self       4
# 5       2    funny      f      y       1
# 6       2    funny     fu     ny       2
# 7       2    funny    fun    nny       3
# 8       2    funny   funn   unny       4
# 9       2    funny  funny  funny       5
# 10      3    nymph      n      h       1
# [...]


# get all possible connections between two cars ---------------------------

connections <- inner_join(car_parts |> select(-left),
           car_parts |> select(-right),
           by = c('overlap', 'right' = 'left'),
           suffix = c('.left', '.right')) |>
  filter(car.id.left != car.id.right) |>
  mutate(connection.id = row_number()) |>
  select(connection.id, car.id.left, car.id.right, car.name.left, car.name.right, coupling = right)

rm(car_parts)

# > connections
#   connection.id car.id.left car.id.right car.name.left car.name.right coupling
# 1             1           1            2          self          funny        f
# 2             2           1            8          self          fluff        f
# 3             3           2            3         funny          nymph       ny
# 4             4           3            4         nymph          house        h
# 5             5           4            7         house           evil        e
# 6             6           4            1         house           self       se
# 7             7           5            3        garden          nymph        n
# 8             8           8            2         fluff          funny        f


# function to store valid trains ------------------------------------------

# example:
# valid_trains <- list()
# valid_trains <- add_valid_train( valid_trains, c(1, 8), c(2) )

add_valid_train <- function(valid_trains, train_cars, train_connections) {
  
  names = c(cars[train_cars[1]],
            vapply(train_connections, \(x) connections$car.name.right[x], "") )
  
  couplings = vapply(train_connections, \(x) connections$coupling[x], "")
  
  append(valid_trains, list(list(cars = train_cars, names = names, couplings = couplings)))
  
}


# function to recursively find next cars to add to train ------------------

# example:
# add_car(9, 5, c(1,2,3), c(1,3,5))

add_car <- function(valid_trains, new_car, new_connection = NULL, train_cars = c(), train_connections = c(), depth = 0) {
  
  cat(strrep('   ',depth), cars[new_car],'\n', sep='')
  
  # store current train as valid
  train_cars <- c(train_cars, new_car)
  train_connections <- c(train_connections, new_connection)
  
  # find next possible cars to add; save train if no more options, otherwise add all options
  options <- connections |> filter(car.id.left == new_car, ! car.id.right %in% train_cars)
  if(nrow(options) == 0) valid_trains <- add_valid_train(valid_trains, train_cars, train_connections) # save only the longest options
  for(i in seq_len(nrow(options))) valid_trains <- add_car(valid_trains, options$car.id.right[i], options$connection.id[i], train_cars, train_connections, depth 1)
  
  return(valid_trains)
  
}


# get all valid trains ----------------------------------------------------

valid_trains <- list()
for(i in seq_along(cars)) add_car(valid_trains, i) -> valid_trains

# filter valid trains that have all cars from `original_cars` -------------

mask <- vapply(valid_trains, \(x) all(seq_along(original_cars) %in% x$cars), T)

new_trains <- lapply(valid_trains[mask], \(x) {
  x$newcars <- setdiff(x$cars, seq_along(original_cars))
  x$newnames <- cars[x$newcars]
  x
})

# print names of all trains that contain all 'original' cars:
#
# > sapply(new_trains, \(x) x$names)
# [[1]] "self"  "funny" "nymph" "house" "evil" 
# [[2]] "self"  "fluff" "funny" "nymph" "house" "evil" 
# [[3]] "funny" "nymph" "house" "self"  "fluff"
# [[4]] "nymph" "house" "self"  "funny"
# [[5]] "nymph" "house" "self"  "fluff" "funny"
# [[6]] "house" "self"  "funny" "nymph"
# [[7]] "house" "self"  "fluff" "funny" "nymph"
# [[8]] "garden" "nymph"  "house"  "self"   "funny" 
# [[9]] "garden" "nymph"  "house"  "self"   "fluff"  "funny" 
# [[10]] "fluff" "funny" "nymph" "house" "self" 

## All possible trains are in `valid_trains`, all of those where *all* the original cars are used are in `new_trains`.
## 
## It is possible that some trains are subsets of others.