Check to see if a series of multiple columns match values in a list-CodePudding

I have a dataframe (df1) that has ~100 columns that starts with "i10_pr". For each row, I would like to see if the values in any of the column that starts with "i10_pr" matches the values in another list (df2). And I would like to create another column that says 1 for yes and 0 for no. My desired output is something as follows

#Desired output
  visitlink visitorder i10_pr1   i10_pr2   i10_pr3   i10_pr4   i10_pr5   match
1   7466851          3 "BW28ZZZ" "BR30Y0Z" "BR39Y0Z" ""        ""        0
2   7023336          1 "0BDC8ZX" "0BDC8ZX" "07D78ZX" ""        ""        0
3   2481935          3 "5A09357" "3C1ZX8Z" "06HN33Z" "B54CZZA" "0W993ZX" 1
4   4605446          1 "5A1955Z" "0BH17EZ" "03HY32Z" "02HV33Z" "B548ZZA" 1
5   7287173          2 ""        ""        ""        ""        ""        0

#df2
     CODE
1 GZ58ZZZ
2 3C1ZX8Z
3 0BH17EZ
4 HZ89ZZZ
5 02HV33Z
6 HZ99ZZZ

I have also included toy codes:

#Df1
structure(list(visitlink = c(7466851, 7023336, 2481935, 4605446, 
7287173), visitorder = c(3L, 1L, 3L, 1L, 2L), i10_pr1 = c("BW28ZZZ", 
"0BDC8ZX", "5A09357", "5A1955Z", ""), i10_pr2 = c("BR30Y0Z", 
"0BDC8ZX", "3C1ZX8Z", "0BH17EZ", ""), i10_pr3 = c("BR39Y0Z", 
"07D78ZX", "06HN33Z", "03HY32Z", ""), i10_pr4 = c("", "", "B54CZZA", 
"02HV33Z", ""), i10_pr5 = c("", "", "0W993ZX", "B548ZZA", "")), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -5L), groups = structure(list(
    visitlink = c(2481935, 4605446, 7023336, 7287173, 7466851
    ), .rows = structure(list(3L, 4L, 2L, 5L, 1L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -5L), .drop = TRUE))

#Df2: list of values to match 
structure(list(CODE = structure(1:7, levels = c("GZ58ZZZ", "3C1ZX8Z", "0BH17EZ", "HZ89ZZZ", "02HV33Z", "HZ99ZZZ", "XW03351"), class = "factor")), row.names = c(NA, 6L), class = "data.frame")

CodePudding user response：

You can use str_detect to rowwise detect if any of the words from df2 c_across columns starting with "i10_pr" exist like in the following code:

#Df1
df1 <- structure(list(visitlink = c(7466851, 7023336, 2481935, 4605446, 
                             7287173), visitorder = c(3L, 1L, 3L, 1L, 2L), i10_pr1 = c("BW28ZZZ", 
                                                                                       "0BDC8ZX", "5A09357", "5A1955Z", ""), i10_pr2 = c("BR30Y0Z", 
                                                                                                                                         "0BDC8ZX", "3C1ZX8Z", "0BH17EZ", ""), i10_pr3 = c("BR39Y0Z", 
                                                                                                                                                                                           "07D78ZX", "06HN33Z", "03HY32Z", ""), i10_pr4 = c("", "", "B54CZZA", 
                                                                                                                                                                                                                                             "02HV33Z", ""), i10_pr5 = c("", "", "0W993ZX", "B548ZZA", "")), class = c("grouped_df", 
                                                                                                                                                                                                                                                                                                                       "tbl_df", "tbl", "data.frame"), row.names = c(NA, -5L), groups = structure(list(
                                                                                                                                                                                                                                                                                                                         visitlink = c(2481935, 4605446, 7023336, 7287173, 7466851
                                                                                                                                                                                                                                                                                                                         ), .rows = structure(list(3L, 4L, 2L, 5L, 1L), ptype = integer(0), class = c("vctrs_list_of", 
                                                                                                                                                                                                                                                                                                                                                                                                      "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
                                                                                                                                                                                                                                                                                                                                                                                                      ), row.names = c(NA, -5L), .drop = TRUE))

#Df2: list of values to match 
df2 <- structure(list(CODE = structure(1:7, levels = c("GZ58ZZZ", "3C1ZX8Z", "0BH17EZ", "HZ89ZZZ", "02HV33Z", "HZ99ZZZ", "XW03351"), class = "factor")), row.names = c(NA, 6L), class = "data.frame")

library(dplyr)
library(stringr)
df1 %>%
  rowwise() %>%
  mutate(match= any(str_detect(c_across(starts_with("i10_pr")), paste(c(df2$CODE),collapse = '|'))))
#> # A tibble: 5 × 8
#> # Rowwise:  visitlink
#>   visitlink visitorder i10_pr1   i10_pr2   i10_pr3   i10_pr4   i10_pr5   match
#>       <dbl>      <int> <chr>     <chr>     <chr>     <chr>     <chr>     <int>
#> 1   7466851          3 "BW28ZZZ" "BR30Y0Z" "BR39Y0Z" ""        ""            0
#> 2   7023336          1 "0BDC8ZX" "0BDC8ZX" "07D78ZX" ""        ""            0
#> 3   2481935          3 "5A09357" "3C1ZX8Z" "06HN33Z" "B54CZZA" "0W993ZX"     1
#> 4   4605446          1 "5A1955Z" "0BH17EZ" "03HY32Z" "02HV33Z" "B548ZZA"     1
#> 5   7287173          2 ""        ""        ""        ""        ""            0

^{Created on 2022-07-15 by the reprex package (v2.0.1)}

CodePudding user response：

You may loop over the codes using sapply and test them for equality with the entire 'i10_pr' subset. The simplify='array' gives us an "array" where we pass any through the third dimension using apply.

sapply(as.character(df2$CODE), `==`, df1[grep('i10_pr', names(df1))], simplify='array') |>
  apply(1, any) |> as.numeric()
# [1] 0 0 1 1 0

Just put it into transform to get the column.

transform(df1, match=sapply(as.character(df2$CODE), `==`, df1[grep('i10_pr', names(df1))], simplify='array') |>
            apply(1, any) |> as.numeric())
#   visitlink visitorder i10_pr1 i10_pr2 i10_pr3 i10_pr4 i10_pr5 match
# 1   7466851          3 BW28ZZZ BR30Y0Z BR39Y0Z                     0
# 2   7023336          1 0BDC8ZX 0BDC8ZX 07D78ZX                     0
# 3   2481935          3 5A09357 3C1ZX8Z 06HN33Z B54CZZA 0W993ZX     1
# 4   4605446          1 5A1955Z 0BH17EZ 03HY32Z 02HV33Z B548ZZA     1
# 5   7287173          2                                             0

Data:

df1 <- structure(list(visitlink = c(7466851, 7023336, 2481935, 4605446, 
7287173), visitorder = c(3L, 1L, 3L, 1L, 2L), i10_pr1 = c("BW28ZZZ", 
"0BDC8ZX", "5A09357", "5A1955Z", ""), i10_pr2 = c("BR30Y0Z", 
"0BDC8ZX", "3C1ZX8Z", "0BH17EZ", ""), i10_pr3 = c("BR39Y0Z", 
"07D78ZX", "06HN33Z", "03HY32Z", ""), i10_pr4 = c("", "", "B54CZZA", 
"02HV33Z", ""), i10_pr5 = c("", "", "0W993ZX", "B548ZZA", "")), class = "data.frame", row.names = c(NA, 
-5L), groups = structure(list(visitlink = c(2481935, 4605446, 
7023336, 7287173, 7466851), .rows = structure(list(3L, 4L, 2L, 
    5L, 1L), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", 
"list"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-5L), .drop = TRUE))

df2 <- structure(list(CODE = structure(1:7, levels = c("GZ58ZZZ", "3C1ZX8Z", 
"0BH17EZ", "HZ89ZZZ", "02HV33Z", "HZ99ZZZ", "XW03351"), class = "factor")), class = "data.frame", row.names = c(NA, 
-7L))