I have this dataset in R:
set.seed(123)
myFun <- function(n = 5000) {
a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
paste0(a, sprintf("d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}
col1 = myFun(100)
col2 = myFun(100)
col3 = myFun(100)
col4 = myFun(100)
group <- c("A","B","C","D")
group = sample(group, 100, replace=TRUE)
example = data.frame(col1, col2, col3, col4, group)
col1 col2 col3 col4 group
1 SKZDZ9876D BTAMF8110T LIBFV6882H ZFIPL4295E A
2 NXJRX7189Y AIZGY5809C HSMIH4556D YJGJP8022H C
3 XPTZB2035P EEKXK0873A PCPNW1021S NMROS4134O A
4 LJMCM3436S KGADK2847O SRMUI5723N RDIXI7301N B
5 ADITC6567L HUOCT5660P AQCNE3753K FUMGY1428B D
6 BAEDP8491P IAGQG4816B TXXQH6337M SDACH5752D C
I am now trying to run the following double loop:
library(stringdist)
method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw","soundex")
results = list()
l = length(unique(example$group))
for (j in 1:l) {
for (i in 1:length(method)) {
g = unique(example$group)
groups_j = g[j]
my_data_i = example[which(example$group == groups_j ), ]
method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)
p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)
p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)
p1_i = p1_i[,3]
p2_i = p2_i[,3]
final_i = cbind(p1_i, p2_i, groups_j)
results[[i]] = final_i
}
}
final = do.call(cbind.data.frame, results)
The loop seems to run - but when I inspect the final results, I noticed that the other indices in the "j" loop seem to have been ignored:
> table(final$groups_j)
A
441
As we can see the original data, there appears to be 4 groups:
> table(example$group)
A B C D
21 28 19 32
Can someone please help me figure out why the other 3 groups are not being processed by my loop?
Thank you!
CodePudding user response:
This is not supposed to be a proper answer. I was just playing around with your code a bit. Nevertheless, it might help you debug it.
library(stringdist)
library(tidyverse)
results = list()
res_j <- list()
l = length(unique(example$group))
g = unique(example$group)
for (j in 1:l) {
groups_j = g[j]
for (i in 1:length(method)) {
my_data_i = example[which(example$group == groups_j ), ]
method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)
p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)
p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)
p1_i = p1_i[,3]
p2_i = p2_i[,3]
final_i = cbind(p1_i, p2_i)
results[[i]] = final_i
}
res_j[[j]] <- flatten(results)
res_j[[j]]$group <- groups_j
}
test <- map_dfr(res_j, as.tibble)
# here’s a summary table of the result set.
library(gtExtras)
gt_plt_summary(test)
CodePudding user response:
Here is a way.
Instead of unique(example$group)
and looping through the data set using this values, split
by group and lapply
the inner for
loop to the sub-data sets.
set.seed(123)
myFun <- function(n = 5000) {
a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
paste0(a, sprintf("d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}
col1 = myFun(100)
col2 = myFun(100)
col3 = myFun(100)
col4 = myFun(100)
group <- c("A","B","C","D")
group = sample(group, 100, replace=TRUE)
example = data.frame(col1, col2, col3, col4, group)
head(example)
#> col1 col2 col3 col4 group
#> 1 OOPBR0319H XFNIX1029D UFTLD7446Q LLRTH2385Q C
#> 2 SUWML2894Y JWGSU4238I HRGIF0793H MTHSV3221Z B
#> 3 NEAXO7570I OQWCR4065E EQVSJ7607Y PTIGN4766W D
#> 4 CHHQS1666T ONOBS9571P EMLSS6601V JEFZH0164K D
#> 5 JSHCU8312A TGWWI3712K SLKFF4079K EXKGJ1406W A
#> 6 RJJRF2760C LMWLS5552P LORMI7587V OYPGF5046D C
suppressPackageStartupMessages({
library(stringdist)
library(magrittr)
library(tidyr)
})
method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw","soundex")
ex_split <- split(example, example$group)
temp <- vector("list", length = length(method))
results <- lapply(ex_split, \(x) {
group <- x$group[1]
for (i in seq_along(method)) {
name_1 <- paste0("col1_col_2_", method[i])
name_2 <- paste0("col3_col_4_", method[i])
p1 <- stringdistmatrix(x$col1, x$col2, method = method[i], useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1)
p2 <- stringdistmatrix(x$col3, x$col4, method = method[i], useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2)
temp[[i]] <- cbind(p1[3], p2[3])
}
y <- do.call(cbind.data.frame, temp)
y$group <- group
y
})
final <- do.call(rbind.data.frame, results)
row.names(final) <- NULL
str(final)
#> 'data.frame': 2610 obs. of 21 variables:
#> $ col1_col_2_osa : num 8 10 10 9 10 7 9 9 9 10 ...
#> $ col3_col_4_osa : num 8 9 9 10 10 9 10 10 9 8 ...
#> $ col1_col_2_lv : num 8 10 10 9 10 7 9 9 9 10 ...
#> $ col3_col_4_lv : num 8 9 9 10 10 9 10 10 9 9 ...
#> $ col1_col_2_dl : num 8 10 10 9 10 7 9 9 9 10 ...
#> $ col3_col_4_dl : num 8 9 9 10 10 9 10 10 9 8 ...
#> $ col1_col_2_hamming: num 8 10 10 9 10 9 9 9 9 10 ...
#> $ col3_col_4_hamming: num 9 9 9 10 10 9 10 10 9 9 ...
#> $ col1_col_2_lcs : num 14 18 16 18 18 12 16 16 14 18 ...
#> $ col3_col_4_lcs : num 14 18 16 16 18 14 16 18 18 16 ...
#> $ col1_col_2_qgram : num 14 18 16 14 18 12 16 14 14 16 ...
#> $ col3_col_4_qgram : num 14 18 16 16 18 12 16 18 18 14 ...
#> $ col1_col_2_cosine : num 0.726 0.817 0.8 0.763 0.915 ...
#> $ col3_col_4_cosine : num 0.662 0.923 0.831 0.746 0.923 ...
#> $ col1_col_2_jaccard: num 0.812 0.944 0.889 0.8 0.941 ...
#> $ col3_col_4_jaccard: num 0.8 0.938 0.875 0.875 0.938 ...
#> $ col1_col_2_jw : num 0.467 0.6 0.533 0.578 0.6 ...
#> $ col3_col_4_jw : num 0.467 0.6 0.533 0.533 0.6 ...
#> $ col1_col_2_soundex: num 1 1 1 1 1 1 1 1 1 1 ...
#> $ col3_col_4_soundex: num 1 1 1 1 1 1 1 1 1 1 ...
#> $ group : chr "A" "A" "A" "A" ...
Created on 2022-11-27 with reprex v2.0.2