I have a string with only three words like this:
first_string <- c("self", "funny", "nymph")
As you can see the words of this vector can all be put together to one word because there is some overlap in letters, i.e. we get selfunnymph. Let`s call this a word train.
Besides, I have another vector with many words. Let the second vector be:
second_string <- c("house", "garden", "duck", "evil", "fluff")
I want to know what words of the second string can be added to the word train. In this case this is house
and fluff
(house
can be added in the end of selfunnymph and fluff
can be put between self
and funny
). So the expected output here would be:
expected <- data.frame(word= c("house", "fluff"), word_train= c("selfunnymphouse", "selfluffunnymph"))
The overlap can be of any length, i.e. self and funny overlap only with one character but funny and nymph overlap in two characters.
EDIT
The new word can change the word order of the first word train. For example, if the second vector contains the word hugs
we can make the word train nymphugselfunny, which puts nymph
before self
and funny
.
CodePudding user response:
It turned out to be much harded than I thought but this is what I ended up doing:
- Make an matrix with the first n letters of each word and another matrix with last n letters of each word
- Comparing the two matrices shows which words overlap
- Paste overlaping words to a word train
- Repeating the steps above until there is no new overlap
Running the code for my data from question gave me such long word trains as I did not expect while writing the question, with the longest word trains being gardenymphouselfluffunny
and selfluffunnymphousevil
(both contain 6 words). The output data is:
wagons train
fluffunnymphouself fluff, f.... fluffunnymphouself
funnymphouselfluff funny, n.... funnymphouselfluff
gardenymphouselfluffunny garden, .... gardenymphouselfluffunny
gardenymphouselfunny garden, .... gardenymphouselfunny
houselfluffunnymph house, s.... houselfluffunnymph
houselfunnymph house, s.... houselfunnymph
selfluffunnymphousevil fluff, f.... selfluffunnymphousevil
selfunnymphousevil funny, n.... selfunnymphousevil
# The column wagons is a list of different length, depending on the words that are in the word train.
The code is quite long though..
# Vectors from question.
first_string <- c("self", "funny", "nymph")
second_string <- c("house", "garden", "duck", "evil", "fluff")
# Prepating the while loop which only runs while there are any new_wagons to add to the train.
all_wagons <- tolower(c(first_string, second_string))
new_wagons <- TRUE
results <- data.frame(wagons= I(list("")), train= "")
# Start the while loop.
while(any(new_wagons, na.rm= TRUE)){
# Going though every train that has been made so far..
all_results <- by(results, list(results$train), function(train_i){
# What wagons have been used for this train?
used_wagons <- unique(unlist(train_i[ , "wagons"]))
used_wagons <- used_wagons[used_wagons != ""]
# What wagons can be used to extend the train?
wagons_to_use_from <- unique(c(all_wagons[!all_wagons %in% used_wagons], train_i[ , "train"]))
# Get the first n letters of every word.
wagon_start <- as.data.frame(sapply(wagons_to_use_from, function(wagon_i){
sapply(1:max(nchar(wagons_to_use_from)), function(length_i){
substr(wagon_i, 1, length_i)
})}))
# Get the last n letters of every word.
wagon_end <- as.data.frame(sapply(wagons_to_use_from, function(wagon_i){
sapply(0:(max(nchar(wagons_to_use_from)-1)), function(length_i){
substr(wagon_i, nchar(wagon_i)-length_i, nchar(wagon_i))
})}))
# Find the overlap in letters.
find_overlap <- data.frame(word= rep(names(wagon_end), each= nrow(wagon_end)))
find_overlap$word_end <- unlist(wagon_end[ , unique(find_overlap$word)])
find_overlap$without_word <- wagon_start[rep(1:nrow(wagon_start), ncol(wagon_end)), , drop= FALSE]
find_overlap$without_word[matrix(c(1:nrow(find_overlap),
rep(1:ncol(wagon_start), each= nrow(wagon_end))),
ncol= 2)] <- NA
new_wagons <- find_overlap$word_end == find_overlap$without_word
# If there is no new overlap then return the data as it was.
if(!any(new_wagons, na.rm= TRUE)){
results <- train_i
} else{
# If there is an overlap then save the relevant words.
word_i <- find_overlap$word[sort(which(new_wagons == TRUE, arr.ind = TRUE)[ , "row"])]
word_overlap <- find_overlap$word_end[sort(which(new_wagons == TRUE, arr.ind = TRUE)[ , "row"])]
word_after_i <- colnames(new_wagons)[which(t(new_wagons) == TRUE, arr.ind = TRUE)[, "row"]]
word_trains <- data.frame(word_i, word_overlap, word_after_i, word_train= paste0(substr(word_i, 1, nchar(word_i)- nchar(word_overlap)),
word_after_i))
# Avoid former word trains as wagon names for next round:
if(train_i$train != ""){
word_trains <- word_trains[word_trains$word_i == train_i$train | word_trains$word_after_i == train_i$train, ]
}
# Output results where the former and new used words as well as the word train is.
results <- do.call("rbind.data.frame", lapply(as.data.frame(t(word_trains)), function(word_trains_i){
used_wagons_old <- used_wagons
used_wagons_new <- c(word_trains_i[1], word_trains_i[3])
wagons <- c(used_wagons_old, used_wagons_new)
wagons <- wagons[wagons != train_i$train]
wagons <- wagons[wagons != ""]
data.frame(wagons= I(list(wagons)),
train= word_trains_i[4]
)
}))
}
list(results, new_wagons)
})
# Make two dataframes, one with the word results, one with logicals whether there is any overlap.
results <- do.call(rbind, lapply(all_results, `[[`, 1))
results <- results[!duplicated(results$train), ]
new_wagons <- unlist(do.call(list, lapply(all_results, `[[`, 2)))
}
CodePudding user response:
I'm wondering why you asked this, but it was a fun exercise regardless. Here's my implementation:
library('dplyr')
# define cars -------------------------------------------------------------
original_cars <- c("self", "funny", "nymph")
new_cars <- c("house", "garden", "duck", "evil", "fluff")
cars <- c(original_cars, new_cars)
# get all possible connections ('parts') per car --------------------------
car_parts <- lapply(seq_along(cars), \(car_id) {
car = cars[car_id]
n = nchar(car)
ids <- rep(car_id, n)
names <- rep(car, n)
left <- vapply(seq_len(n), \(i) substr(car, 1, i), "")
right <- vapply(seq_len(n), \(i) substr(car, n-i 1, n), "")
overlap <- nchar(left)
data.frame(car.id = ids, car.name = names, left = left, right = right, overlap = overlap)
}) |> do.call(rbind, args=_)
# > car_parts
# car.id car.name left right overlap
# 1 1 self s f 1
# 2 1 self se lf 2
# 3 1 self sel elf 3
# 4 1 self self self 4
# 5 2 funny f y 1
# 6 2 funny fu ny 2
# 7 2 funny fun nny 3
# 8 2 funny funn unny 4
# 9 2 funny funny funny 5
# 10 3 nymph n h 1
# [...]
# get all possible connections between two cars ---------------------------
connections <- inner_join(car_parts |> select(-left),
car_parts |> select(-right),
by = c('overlap', 'right' = 'left'),
suffix = c('.left', '.right')) |>
filter(car.id.left != car.id.right) |>
mutate(connection.id = row_number()) |>
select(connection.id, car.id.left, car.id.right, car.name.left, car.name.right, coupling = right)
rm(car_parts)
# > connections
# connection.id car.id.left car.id.right car.name.left car.name.right coupling
# 1 1 1 2 self funny f
# 2 2 1 8 self fluff f
# 3 3 2 3 funny nymph ny
# 4 4 3 4 nymph house h
# 5 5 4 7 house evil e
# 6 6 4 1 house self se
# 7 7 5 3 garden nymph n
# 8 8 8 2 fluff funny f
# function to store valid trains ------------------------------------------
# example:
# valid_trains <- list()
# valid_trains <- add_valid_train( valid_trains, c(1, 8), c(2) )
add_valid_train <- function(valid_trains, train_cars, train_connections) {
names = c(cars[train_cars[1]],
vapply(train_connections, \(x) connections$car.name.right[x], "") )
couplings = vapply(train_connections, \(x) connections$coupling[x], "")
append(valid_trains, list(list(cars = train_cars, names = names, couplings = couplings)))
}
# function to recursively find next cars to add to train ------------------
# example:
# add_car(9, 5, c(1,2,3), c(1,3,5))
add_car <- function(valid_trains, new_car, new_connection = NULL, train_cars = c(), train_connections = c(), depth = 0) {
cat(strrep(' ',depth), cars[new_car],'\n', sep='')
# store current train as valid
train_cars <- c(train_cars, new_car)
train_connections <- c(train_connections, new_connection)
# find next possible cars to add; save train if no more options, otherwise add all options
options <- connections |> filter(car.id.left == new_car, ! car.id.right %in% train_cars)
if(nrow(options) == 0) valid_trains <- add_valid_train(valid_trains, train_cars, train_connections) # save only the longest options
for(i in seq_len(nrow(options))) valid_trains <- add_car(valid_trains, options$car.id.right[i], options$connection.id[i], train_cars, train_connections, depth 1)
return(valid_trains)
}
# get all valid trains ----------------------------------------------------
valid_trains <- list()
for(i in seq_along(cars)) add_car(valid_trains, i) -> valid_trains
# filter valid trains that have all cars from `original_cars` -------------
mask <- vapply(valid_trains, \(x) all(seq_along(original_cars) %in% x$cars), T)
new_trains <- lapply(valid_trains[mask], \(x) {
x$newcars <- setdiff(x$cars, seq_along(original_cars))
x$newnames <- cars[x$newcars]
x
})
# print names of all trains that contain all 'original' cars:
#
# > sapply(new_trains, \(x) x$names)
# [[1]] "self" "funny" "nymph" "house" "evil"
# [[2]] "self" "fluff" "funny" "nymph" "house" "evil"
# [[3]] "funny" "nymph" "house" "self" "fluff"
# [[4]] "nymph" "house" "self" "funny"
# [[5]] "nymph" "house" "self" "fluff" "funny"
# [[6]] "house" "self" "funny" "nymph"
# [[7]] "house" "self" "fluff" "funny" "nymph"
# [[8]] "garden" "nymph" "house" "self" "funny"
# [[9]] "garden" "nymph" "house" "self" "fluff" "funny"
# [[10]] "fluff" "funny" "nymph" "house" "self"
## All possible trains are in `valid_trains`, all of those where *all* the original cars are used are in `new_trains`.
##
## It is possible that some trains are subsets of others.