I have a dataframe (df1) that has ~100 columns that starts with "i10_pr". For each row, I would like to see if the values in any of the column that starts with "i10_pr" matches the values in another list (df2). And I would like to create another column that says 1 for yes and 0 for no. My desired output is something as follows
#Desired output
visitlink visitorder i10_pr1 i10_pr2 i10_pr3 i10_pr4 i10_pr5 match
1 7466851 3 "BW28ZZZ" "BR30Y0Z" "BR39Y0Z" "" "" 0
2 7023336 1 "0BDC8ZX" "0BDC8ZX" "07D78ZX" "" "" 0
3 2481935 3 "5A09357" "3C1ZX8Z" "06HN33Z" "B54CZZA" "0W993ZX" 1
4 4605446 1 "5A1955Z" "0BH17EZ" "03HY32Z" "02HV33Z" "B548ZZA" 1
5 7287173 2 "" "" "" "" "" 0
#df2
CODE
1 GZ58ZZZ
2 3C1ZX8Z
3 0BH17EZ
4 HZ89ZZZ
5 02HV33Z
6 HZ99ZZZ
I have also included toy codes:
#Df1
structure(list(visitlink = c(7466851, 7023336, 2481935, 4605446,
7287173), visitorder = c(3L, 1L, 3L, 1L, 2L), i10_pr1 = c("BW28ZZZ",
"0BDC8ZX", "5A09357", "5A1955Z", ""), i10_pr2 = c("BR30Y0Z",
"0BDC8ZX", "3C1ZX8Z", "0BH17EZ", ""), i10_pr3 = c("BR39Y0Z",
"07D78ZX", "06HN33Z", "03HY32Z", ""), i10_pr4 = c("", "", "B54CZZA",
"02HV33Z", ""), i10_pr5 = c("", "", "0W993ZX", "B548ZZA", "")), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -5L), groups = structure(list(
visitlink = c(2481935, 4605446, 7023336, 7287173, 7466851
), .rows = structure(list(3L, 4L, 2L, 5L, 1L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -5L), .drop = TRUE))
#Df2: list of values to match
structure(list(CODE = structure(1:7, levels = c("GZ58ZZZ", "3C1ZX8Z", "0BH17EZ", "HZ89ZZZ", "02HV33Z", "HZ99ZZZ", "XW03351"), class = "factor")), row.names = c(NA, 6L), class = "data.frame")
CodePudding user response:
You can use str_detect
to rowwise
detect if any
of the words from df2 c_across
columns starting with "i10_pr" exist like in the following code:
#Df1
df1 <- structure(list(visitlink = c(7466851, 7023336, 2481935, 4605446,
7287173), visitorder = c(3L, 1L, 3L, 1L, 2L), i10_pr1 = c("BW28ZZZ",
"0BDC8ZX", "5A09357", "5A1955Z", ""), i10_pr2 = c("BR30Y0Z",
"0BDC8ZX", "3C1ZX8Z", "0BH17EZ", ""), i10_pr3 = c("BR39Y0Z",
"07D78ZX", "06HN33Z", "03HY32Z", ""), i10_pr4 = c("", "", "B54CZZA",
"02HV33Z", ""), i10_pr5 = c("", "", "0W993ZX", "B548ZZA", "")), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -5L), groups = structure(list(
visitlink = c(2481935, 4605446, 7023336, 7287173, 7466851
), .rows = structure(list(3L, 4L, 2L, 5L, 1L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -5L), .drop = TRUE))
#Df2: list of values to match
df2 <- structure(list(CODE = structure(1:7, levels = c("GZ58ZZZ", "3C1ZX8Z", "0BH17EZ", "HZ89ZZZ", "02HV33Z", "HZ99ZZZ", "XW03351"), class = "factor")), row.names = c(NA, 6L), class = "data.frame")
library(dplyr)
library(stringr)
df1 %>%
rowwise() %>%
mutate(match= any(str_detect(c_across(starts_with("i10_pr")), paste(c(df2$CODE),collapse = '|'))))
#> # A tibble: 5 × 8
#> # Rowwise: visitlink
#> visitlink visitorder i10_pr1 i10_pr2 i10_pr3 i10_pr4 i10_pr5 match
#> <dbl> <int> <chr> <chr> <chr> <chr> <chr> <int>
#> 1 7466851 3 "BW28ZZZ" "BR30Y0Z" "BR39Y0Z" "" "" 0
#> 2 7023336 1 "0BDC8ZX" "0BDC8ZX" "07D78ZX" "" "" 0
#> 3 2481935 3 "5A09357" "3C1ZX8Z" "06HN33Z" "B54CZZA" "0W993ZX" 1
#> 4 4605446 1 "5A1955Z" "0BH17EZ" "03HY32Z" "02HV33Z" "B548ZZA" 1
#> 5 7287173 2 "" "" "" "" "" 0
Created on 2022-07-15 by the reprex package (v2.0.1)
CodePudding user response:
You may loop over the codes using sapply
and test them for equality with the entire 'i10_pr'
subset. The simplify='array'
gives us an "array"
where we pass any
through the third dimension using apply
.
sapply(as.character(df2$CODE), `==`, df1[grep('i10_pr', names(df1))], simplify='array') |>
apply(1, any) |> as.numeric()
# [1] 0 0 1 1 0
Just put it into transform
to get the column.
transform(df1, match=sapply(as.character(df2$CODE), `==`, df1[grep('i10_pr', names(df1))], simplify='array') |>
apply(1, any) |> as.numeric())
# visitlink visitorder i10_pr1 i10_pr2 i10_pr3 i10_pr4 i10_pr5 match
# 1 7466851 3 BW28ZZZ BR30Y0Z BR39Y0Z 0
# 2 7023336 1 0BDC8ZX 0BDC8ZX 07D78ZX 0
# 3 2481935 3 5A09357 3C1ZX8Z 06HN33Z B54CZZA 0W993ZX 1
# 4 4605446 1 5A1955Z 0BH17EZ 03HY32Z 02HV33Z B548ZZA 1
# 5 7287173 2 0
Data:
df1 <- structure(list(visitlink = c(7466851, 7023336, 2481935, 4605446,
7287173), visitorder = c(3L, 1L, 3L, 1L, 2L), i10_pr1 = c("BW28ZZZ",
"0BDC8ZX", "5A09357", "5A1955Z", ""), i10_pr2 = c("BR30Y0Z",
"0BDC8ZX", "3C1ZX8Z", "0BH17EZ", ""), i10_pr3 = c("BR39Y0Z",
"07D78ZX", "06HN33Z", "03HY32Z", ""), i10_pr4 = c("", "", "B54CZZA",
"02HV33Z", ""), i10_pr5 = c("", "", "0W993ZX", "B548ZZA", "")), class = "data.frame", row.names = c(NA,
-5L), groups = structure(list(visitlink = c(2481935, 4605446,
7023336, 7287173, 7466851), .rows = structure(list(3L, 4L, 2L,
5L, 1L), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr",
"list"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-5L), .drop = TRUE))
df2 <- structure(list(CODE = structure(1:7, levels = c("GZ58ZZZ", "3C1ZX8Z",
"0BH17EZ", "HZ89ZZZ", "02HV33Z", "HZ99ZZZ", "XW03351"), class = "factor")), class = "data.frame", row.names = c(NA,
-7L))