Compare character rows of a df matching NA to everything and create new column or df based on compar-CodePudding

I have a very large dataframe with character values. I want to compare the rows to each other and create IDs based on the comparison. The problem is that there are NA in the df and that I want those be evaluated as matching any value. The other issue is that the IDs need to be created as well in the same step (or I'm thinking about the problem in a too complicated way).

Here's the toy df I created:

library(tidyverse)
library(purrr)

# make toy df
Set1 <- c("A", "B", "C","A")
Set2 <- c("A", "D", "B", "A")
Set3 <- c(NA, "B", "C", "A")
Set4 <- c("A", "G", "B", "A")
Set5 <- c("F", "G", NA, "F")
Set6 <- c("A", "B", "C", "C")
sets <- rbind(Set1, Set2, Set3, Set4, Set5, Set6)
colnames(sets) <- c("Var1", "Var2", "Var3", "Var4")
sets

     Var1 Var2 Var3 Var4
Set1 "A"  "B"  "C"  "A" 
Set2 "A"  "D"  "B"  "A" 
Set3 NA   "B"  "C"  "A" 
Set4 "A"  "D"  "B"  "A" 
Set5 "F"  "G"  NA   "F" 
Set6 "A"  "B"  "C"  "C"

And here's the desired output, either as a separate df or as a new column, either one would be just as good:

# as new column
    Var1 Var2 Var3 Var4 COMP
Set1 "A"  "B"  "C"  "A" "Group1"
Set2 "A"  "D"  "B"  "A" "Group2
Set3 NA   "B"  "C"  "A" "Group1"
Set4 "A"  "D"  "B"  "A" "Group3"
Set5 "F"  "G"  NA   "F" "Group4"
Set6 "A"  "B"  "C"  "C" "Group5"

# as new df
      COMP
Set1 "Group1"
Set2 "Group2
Set3 "Group1"
Set4 "Group3"
Set5 "Group4"
Set6 "Group5"

I'm thinking this can be achieved with rowwise() and map, but even after reading similar questions I cannot figure out exactly how to achieve this, especially how to name the new groups consecutively and consistently. Any ideas would be much appreciated.

CodePudding user response：

A very ugly while loop solution but I think it works.

#Change sets to dataframe
sets <- data.frame(sets)
result <- integer(nrow(sets))
group_count <- 1
x <- 1

while(any(result == 0)) {
  a <- sets[-x, !is.na(sets[x, ])]
  b <- na.omit(unlist(sets[x, ]))
  inds <- which(rowSums(sweep(a, 2, as.matrix(b), `==`), na.rm = TRUE) == length(b))
  #If a complete match is found
  if(length(inds)) {
    #Need to adjust the match since we are dropping 1 row from original df
    if(all(inds > x)) {
      result[c(x, inds   1)] <- group_count  
    } else {
      result[c(x, inds)] <- group_count  
    }
  } else {
    result[x] <- group_count
  }
  group_count <- group_count   1
  #Get the next row number to check. 
  x <- which(result == 0)[1]
}

#Reset so you get counts in order 1, 2, 3...
result <- match(result, unique(result))
result
[1] 1 2 1 2 3 4

The logic here is to compare every row value with every other row in the dataframe dropping their NA values and if there is a match we update the rows with group_count value.

CodePudding user response：

You can do some fuzzy joining after creating group ids:

library(tidyverse)
library(fuzzyjoin)
library(stringdist)
#> 
#> Attaching package: 'stringdist'
#> The following object is masked from 'package:tidyr':
#> 
#>     extract

# make toy df
Set1 <- c("A", "B", "C","A")
Set2 <- c("A", "D", "B", "A")
Set3 <- c(NA, "B", "C", "A")
Set4 <- c("A", "D", "B", "A")
Set5 <- c("F", "G", NA, "F")
Set6 <- c("A", "B", "C", "C")
sets <- rbind(Set1, Set2, Set3, Set4, Set5, Set6)
colnames(sets) <- c("Var1", "Var2", "Var3", "Var4")
sets
#>      Var1 Var2 Var3 Var4
#> Set1 "A"  "B"  "C"  "A" 
#> Set2 "A"  "D"  "B"  "A" 
#> Set3 NA   "B"  "C"  "A" 
#> Set4 "A"  "D"  "B"  "A" 
#> Set5 "F"  "G"  NA   "F" 
#> Set6 "A"  "B"  "C"  "C"

elements <-
  sets %>%
  as_tibble() %>%
  pivot_longer(everything()) %>%
  pull(value) %>%
  unique() %>%
  discard(is.na)
elements
#> [1] "A" "B" "C" "D" "F" "G"

groups <-
  expand_grid(
    Var1 = elements,
    Var2 = elements,
    Var3 = elements,
    Var4 = elements
  ) %>%
    mutate(group = row_number() %>% paste0("Group", .)) %>%
    unite(group_str, starts_with("Var"))
groups
#> # A tibble: 1,296 × 2
#>    group_str group  
#>    <chr>     <chr>  
#>  1 A_A_A_A   Group1 
#>  2 A_A_A_B   Group2 
#>  3 A_A_A_C   Group3 
#>  4 A_A_A_D   Group4 
#>  5 A_A_A_F   Group5 
#>  6 A_A_A_G   Group6 
#>  7 A_A_B_A   Group7 
#>  8 A_A_B_B   Group8 
#>  9 A_A_B_C   Group9 
#> 10 A_A_B_D   Group10
#> # … with 1,286 more rows

match iff strings x and y are exact but also allow one char off iff there is one #

compare <- function(x, y) {
    (
      stringdist(x, y) <= 1 & paste0(x, y) %>% str_count("#") == 1
    ) |
    (
      x == y
    )
}

sets %>%
  as_tibble(rownames = "set") %>%
  mutate_all(~ .x %>% replace_na("#")) %>%
  unite(group_str, starts_with("Var")) %>%
  fuzzy_left_join(groups, match_fun = compare)
#> Joining by: "group_str"
#> # A tibble: 16 × 4
#>    set   group_str.x group_str.y group    
#>    <chr> <chr>       <chr>       <chr>    
#>  1 Set1  A_B_C_A     A_B_C_A     Group49  
#>  2 Set2  A_D_B_A     A_D_B_A     Group115 
#>  3 Set3  #_B_C_A     A_B_C_A     Group49  
#>  4 Set3  #_B_C_A     B_B_C_A     Group265 
#>  5 Set3  #_B_C_A     C_B_C_A     Group481 
#>  6 Set3  #_B_C_A     D_B_C_A     Group697 
#>  7 Set3  #_B_C_A     F_B_C_A     Group913 
#>  8 Set3  #_B_C_A     G_B_C_A     Group1129
#>  9 Set4  A_D_B_A     A_D_B_A     Group115 
#> 10 Set5  F_G_#_F     F_G_A_F     Group1049
#> 11 Set5  F_G_#_F     F_G_B_F     Group1055
#> 12 Set5  F_G_#_F     F_G_C_F     Group1061
#> 13 Set5  F_G_#_F     F_G_D_F     Group1067
#> 14 Set5  F_G_#_F     F_G_F_F     Group1073
#> 15 Set5  F_G_#_F     F_G_G_F     Group1079
#> 16 Set6  A_B_C_C     A_B_C_C     Group51

^{Created on 2021-09-25 by the reprex package (v2.0.1)}

CodePudding user response：

You could replace the NA with a ., paste into a string and pattern match using grepl().

library(magrittr)

sets <- as.data.frame(sets)

sets %>%
  replace(is.na(sets), ".") %>%
  do.call(paste0, .) %>%
  outer(., ., function(x, y) mapply(grepl, x, y)) %>%
  t() %>%
  max.col(ties.method = "last") %>%
  match(unique(.))

[1] 1 2 1 2 3 4

But it's possible treating NAs as wild will match multiple rows so it may be safer to do:

# Change Row 6 so both Row 6 and Row 1 match Row 3
Set6 <- c("B", "B", "C", "A")

sets %>%
  replace(is.na(sets), ".") %>%
  do.call(paste0, .) %>%
  outer(., ., function(x, y) mapply(grepl, x, y)) %>%
  apply(2, which)

[[1]]
[1] 1 3

[[2]]
[1] 2 4

[[3]]
[1] 3

[[4]]
[1] 2 4

[[5]]
[1] 5

[[6]]
[1] 3 6

This tells which row is a match for which other row (including itself).