I have a very large dataframe with character values. I want to compare the rows to each other and create IDs based on the comparison. The problem is that there are NA in the df and that I want those be evaluated as matching any value. The other issue is that the IDs need to be created as well in the same step (or I'm thinking about the problem in a too complicated way).
Here's the toy df I created:
library(tidyverse)
library(purrr)
# make toy df
Set1 <- c("A", "B", "C","A")
Set2 <- c("A", "D", "B", "A")
Set3 <- c(NA, "B", "C", "A")
Set4 <- c("A", "G", "B", "A")
Set5 <- c("F", "G", NA, "F")
Set6 <- c("A", "B", "C", "C")
sets <- rbind(Set1, Set2, Set3, Set4, Set5, Set6)
colnames(sets) <- c("Var1", "Var2", "Var3", "Var4")
sets
Var1 Var2 Var3 Var4
Set1 "A" "B" "C" "A"
Set2 "A" "D" "B" "A"
Set3 NA "B" "C" "A"
Set4 "A" "D" "B" "A"
Set5 "F" "G" NA "F"
Set6 "A" "B" "C" "C"
And here's the desired output, either as a separate df or as a new column, either one would be just as good:
# as new column
Var1 Var2 Var3 Var4 COMP
Set1 "A" "B" "C" "A" "Group1"
Set2 "A" "D" "B" "A" "Group2
Set3 NA "B" "C" "A" "Group1"
Set4 "A" "D" "B" "A" "Group3"
Set5 "F" "G" NA "F" "Group4"
Set6 "A" "B" "C" "C" "Group5"
# as new df
COMP
Set1 "Group1"
Set2 "Group2
Set3 "Group1"
Set4 "Group3"
Set5 "Group4"
Set6 "Group5"
I'm thinking this can be achieved with rowwise()
and map
, but even after reading similar questions I cannot figure out exactly how to achieve this, especially how to name the new groups consecutively and consistently. Any ideas would be much appreciated.
CodePudding user response:
A very ugly while
loop solution but I think it works.
#Change sets to dataframe
sets <- data.frame(sets)
result <- integer(nrow(sets))
group_count <- 1
x <- 1
while(any(result == 0)) {
a <- sets[-x, !is.na(sets[x, ])]
b <- na.omit(unlist(sets[x, ]))
inds <- which(rowSums(sweep(a, 2, as.matrix(b), `==`), na.rm = TRUE) == length(b))
#If a complete match is found
if(length(inds)) {
#Need to adjust the match since we are dropping 1 row from original df
if(all(inds > x)) {
result[c(x, inds 1)] <- group_count
} else {
result[c(x, inds)] <- group_count
}
} else {
result[x] <- group_count
}
group_count <- group_count 1
#Get the next row number to check.
x <- which(result == 0)[1]
}
#Reset so you get counts in order 1, 2, 3...
result <- match(result, unique(result))
result
[1] 1 2 1 2 3 4
The logic here is to compare every row value with every other row in the dataframe dropping their NA
values and if there is a match we update the rows with group_count
value.
CodePudding user response:
You can do some fuzzy joining after creating group ids:
library(tidyverse)
library(fuzzyjoin)
library(stringdist)
#>
#> Attaching package: 'stringdist'
#> The following object is masked from 'package:tidyr':
#>
#> extract
# make toy df
Set1 <- c("A", "B", "C","A")
Set2 <- c("A", "D", "B", "A")
Set3 <- c(NA, "B", "C", "A")
Set4 <- c("A", "D", "B", "A")
Set5 <- c("F", "G", NA, "F")
Set6 <- c("A", "B", "C", "C")
sets <- rbind(Set1, Set2, Set3, Set4, Set5, Set6)
colnames(sets) <- c("Var1", "Var2", "Var3", "Var4")
sets
#> Var1 Var2 Var3 Var4
#> Set1 "A" "B" "C" "A"
#> Set2 "A" "D" "B" "A"
#> Set3 NA "B" "C" "A"
#> Set4 "A" "D" "B" "A"
#> Set5 "F" "G" NA "F"
#> Set6 "A" "B" "C" "C"
elements <-
sets %>%
as_tibble() %>%
pivot_longer(everything()) %>%
pull(value) %>%
unique() %>%
discard(is.na)
elements
#> [1] "A" "B" "C" "D" "F" "G"
groups <-
expand_grid(
Var1 = elements,
Var2 = elements,
Var3 = elements,
Var4 = elements
) %>%
mutate(group = row_number() %>% paste0("Group", .)) %>%
unite(group_str, starts_with("Var"))
groups
#> # A tibble: 1,296 × 2
#> group_str group
#> <chr> <chr>
#> 1 A_A_A_A Group1
#> 2 A_A_A_B Group2
#> 3 A_A_A_C Group3
#> 4 A_A_A_D Group4
#> 5 A_A_A_F Group5
#> 6 A_A_A_G Group6
#> 7 A_A_B_A Group7
#> 8 A_A_B_B Group8
#> 9 A_A_B_C Group9
#> 10 A_A_B_D Group10
#> # … with 1,286 more rows
match iff strings x and y are exact
but also allow one char off iff there is one #
compare <- function(x, y) {
(
stringdist(x, y) <= 1 & paste0(x, y) %>% str_count("#") == 1
) |
(
x == y
)
}
sets %>%
as_tibble(rownames = "set") %>%
mutate_all(~ .x %>% replace_na("#")) %>%
unite(group_str, starts_with("Var")) %>%
fuzzy_left_join(groups, match_fun = compare)
#> Joining by: "group_str"
#> # A tibble: 16 × 4
#> set group_str.x group_str.y group
#> <chr> <chr> <chr> <chr>
#> 1 Set1 A_B_C_A A_B_C_A Group49
#> 2 Set2 A_D_B_A A_D_B_A Group115
#> 3 Set3 #_B_C_A A_B_C_A Group49
#> 4 Set3 #_B_C_A B_B_C_A Group265
#> 5 Set3 #_B_C_A C_B_C_A Group481
#> 6 Set3 #_B_C_A D_B_C_A Group697
#> 7 Set3 #_B_C_A F_B_C_A Group913
#> 8 Set3 #_B_C_A G_B_C_A Group1129
#> 9 Set4 A_D_B_A A_D_B_A Group115
#> 10 Set5 F_G_#_F F_G_A_F Group1049
#> 11 Set5 F_G_#_F F_G_B_F Group1055
#> 12 Set5 F_G_#_F F_G_C_F Group1061
#> 13 Set5 F_G_#_F F_G_D_F Group1067
#> 14 Set5 F_G_#_F F_G_F_F Group1073
#> 15 Set5 F_G_#_F F_G_G_F Group1079
#> 16 Set6 A_B_C_C A_B_C_C Group51
Created on 2021-09-25 by the reprex package (v2.0.1)
CodePudding user response:
You could replace the NA
with a .
, paste into a string and pattern match using grepl()
.
library(magrittr)
sets <- as.data.frame(sets)
sets %>%
replace(is.na(sets), ".") %>%
do.call(paste0, .) %>%
outer(., ., function(x, y) mapply(grepl, x, y)) %>%
t() %>%
max.col(ties.method = "last") %>%
match(unique(.))
[1] 1 2 1 2 3 4
But it's possible treating NA
s as wild will match multiple rows so it may be safer to do:
# Change Row 6 so both Row 6 and Row 1 match Row 3
Set6 <- c("B", "B", "C", "A")
sets %>%
replace(is.na(sets), ".") %>%
do.call(paste0, .) %>%
outer(., ., function(x, y) mapply(grepl, x, y)) %>%
apply(2, which)
[[1]]
[1] 1 3
[[2]]
[1] 2 4
[[3]]
[1] 3
[[4]]
[1] 2 4
[[5]]
[1] 5
[[6]]
[1] 3 6
This tells which row is a match for which other row (including itself).