Home > Blockchain >  Loop Index Is Being Ignored in R?
Loop Index Is Being Ignored in R?

Time:11-27

I have this dataset in R:

set.seed(123)

myFun <- function(n = 5000) {
  a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
  paste0(a, sprintf("d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}

col1 = myFun(100)
col2 = myFun(100)
col3 = myFun(100)
col4 = myFun(100)
group <- c("A","B","C","D")
group = sample(group, 100, replace=TRUE)

example = data.frame(col1, col2, col3, col4, group)

       col1       col2       col3       col4 group
1 SKZDZ9876D BTAMF8110T LIBFV6882H ZFIPL4295E     A
2 NXJRX7189Y AIZGY5809C HSMIH4556D YJGJP8022H     C
3 XPTZB2035P EEKXK0873A PCPNW1021S NMROS4134O     A
4 LJMCM3436S KGADK2847O SRMUI5723N RDIXI7301N     B
5 ADITC6567L HUOCT5660P AQCNE3753K FUMGY1428B     D
6 BAEDP8491P IAGQG4816B TXXQH6337M SDACH5752D     C

I am now trying to run the following double loop:

library(stringdist)
method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw","soundex")

results = list()

l = length(unique(example$group))

 for (j in 1:l) {
for (i in 1:length(method)) {
   
        
        g = unique(example$group)

        groups_j = g[j]

        my_data_i = example[which(example$group == groups_j  ), ]
        
        
        method_i = method[i]
        name_1_i = paste0("col1_col_2", method_i)
        name_2_i = paste0("col3_col_4", method_i)
        
        p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method =  method_i, useNames = "string") %>%
            as_tibble(rownames = "a") %>%
            pivot_longer(-1, names_to = "b", values_to = name_1_i)
        
        p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method =  method_i, useNames = "string") %>%
            as_tibble(rownames = "a") %>%
            pivot_longer(-1, names_to = "b", values_to = name_2_i)
        
        p1_i = p1_i[,3]
        p2_i = p2_i[,3]
        
        final_i = cbind(p1_i, p2_i, groups_j)
        results[[i]] = final_i
        
    }
    
}

final = do.call(cbind.data.frame, results)

The loop seems to run - but when I inspect the final results, I noticed that the other indices in the "j" loop seem to have been ignored:

> table(final$groups_j)

  A 
441 

As we can see the original data, there appears to be 4 groups:

> table(example$group)

 A  B  C  D 
21 28 19 32 

Can someone please help me figure out why the other 3 groups are not being processed by my loop?

Thank you!

CodePudding user response:

This is not supposed to be a proper answer. I was just playing around with your code a bit. Nevertheless, it might help you debug it.

library(stringdist)
library(tidyverse)

results = list()
res_j <- list()

l = length(unique(example$group))
g = unique(example$group)

for (j in 1:l) {
  
  groups_j = g[j]
  
  for (i in 1:length(method)) {
  
    my_data_i = example[which(example$group == groups_j  ), ]
    
    method_i = method[i]
    name_1_i = paste0("col1_col_2", method_i)
    name_2_i = paste0("col3_col_4", method_i)
    
    p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method =  method_i, useNames = "string") %>%
      as_tibble(rownames = "a") %>%
      pivot_longer(-1, names_to = "b", values_to = name_1_i)
    
    p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method =  method_i, useNames = "string") %>%
      as_tibble(rownames = "a") %>%
      pivot_longer(-1, names_to = "b", values_to = name_2_i)
    
    p1_i = p1_i[,3]
    p2_i = p2_i[,3]
    
    final_i = cbind(p1_i, p2_i)
    results[[i]] = final_i
    
  }
  res_j[[j]] <- flatten(results)
  res_j[[j]]$group <- groups_j
}

test <- map_dfr(res_j, as.tibble) 

# here’s a summary table of the result set.
library(gtExtras)
gt_plt_summary(test) 

CodePudding user response:

Here is a way.

Instead of unique(example$group) and looping through the data set using this values, split by group and lapply the inner for loop to the sub-data sets.

set.seed(123)

myFun <- function(n = 5000) {
  a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
  paste0(a, sprintf("d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}

col1 = myFun(100)
col2 = myFun(100)
col3 = myFun(100)
col4 = myFun(100)
group <- c("A","B","C","D")
group = sample(group, 100, replace=TRUE)

example = data.frame(col1, col2, col3, col4, group)
head(example)
#>         col1       col2       col3       col4 group
#> 1 OOPBR0319H XFNIX1029D UFTLD7446Q LLRTH2385Q     C
#> 2 SUWML2894Y JWGSU4238I HRGIF0793H MTHSV3221Z     B
#> 3 NEAXO7570I OQWCR4065E EQVSJ7607Y PTIGN4766W     D
#> 4 CHHQS1666T ONOBS9571P EMLSS6601V JEFZH0164K     D
#> 5 JSHCU8312A TGWWI3712K SLKFF4079K EXKGJ1406W     A
#> 6 RJJRF2760C LMWLS5552P LORMI7587V OYPGF5046D     C

suppressPackageStartupMessages({
  library(stringdist)
  library(magrittr)
  library(tidyr)
})

method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw","soundex")

ex_split <- split(example, example$group)
temp <- vector("list", length = length(method))

results <- lapply(ex_split, \(x) {
  group <- x$group[1]
  for (i in seq_along(method)) {
    name_1 <- paste0("col1_col_2_", method[i])
    name_2 <- paste0("col3_col_4_", method[i])
    
    p1 <- stringdistmatrix(x$col1, x$col2, method = method[i], useNames = "string") %>%
      as_tibble(rownames = "a") %>%
      pivot_longer(-1, names_to = "b", values_to = name_1)
    
    p2 <- stringdistmatrix(x$col3, x$col4, method = method[i], useNames = "string") %>%
      as_tibble(rownames = "a") %>%
      pivot_longer(-1, names_to = "b", values_to = name_2)
    
    temp[[i]] <- cbind(p1[3], p2[3])
  }
  y <- do.call(cbind.data.frame, temp)
  y$group <- group
  y
})

final <- do.call(rbind.data.frame, results)
row.names(final) <- NULL
str(final)
#> 'data.frame':    2610 obs. of  21 variables:
#>  $ col1_col_2_osa    : num  8 10 10 9 10 7 9 9 9 10 ...
#>  $ col3_col_4_osa    : num  8 9 9 10 10 9 10 10 9 8 ...
#>  $ col1_col_2_lv     : num  8 10 10 9 10 7 9 9 9 10 ...
#>  $ col3_col_4_lv     : num  8 9 9 10 10 9 10 10 9 9 ...
#>  $ col1_col_2_dl     : num  8 10 10 9 10 7 9 9 9 10 ...
#>  $ col3_col_4_dl     : num  8 9 9 10 10 9 10 10 9 8 ...
#>  $ col1_col_2_hamming: num  8 10 10 9 10 9 9 9 9 10 ...
#>  $ col3_col_4_hamming: num  9 9 9 10 10 9 10 10 9 9 ...
#>  $ col1_col_2_lcs    : num  14 18 16 18 18 12 16 16 14 18 ...
#>  $ col3_col_4_lcs    : num  14 18 16 16 18 14 16 18 18 16 ...
#>  $ col1_col_2_qgram  : num  14 18 16 14 18 12 16 14 14 16 ...
#>  $ col3_col_4_qgram  : num  14 18 16 16 18 12 16 18 18 14 ...
#>  $ col1_col_2_cosine : num  0.726 0.817 0.8 0.763 0.915 ...
#>  $ col3_col_4_cosine : num  0.662 0.923 0.831 0.746 0.923 ...
#>  $ col1_col_2_jaccard: num  0.812 0.944 0.889 0.8 0.941 ...
#>  $ col3_col_4_jaccard: num  0.8 0.938 0.875 0.875 0.938 ...
#>  $ col1_col_2_jw     : num  0.467 0.6 0.533 0.578 0.6 ...
#>  $ col3_col_4_jw     : num  0.467 0.6 0.533 0.533 0.6 ...
#>  $ col1_col_2_soundex: num  1 1 1 1 1 1 1 1 1 1 ...
#>  $ col3_col_4_soundex: num  1 1 1 1 1 1 1 1 1 1 ...
#>  $ group             : chr  "A" "A" "A" "A" ...

Created on 2022-11-27 with reprex v2.0.2

  •  Tags:  
  • r
  • Related