I have a very large dataframe (a snippet of it below) that I would like to filter. I would only like to keep the rows if at least one entry in the columns that start with "i10_pr" start with any values I have in list
.
Here is an example of my dataframe and the list
#> ID visitorder i10_pr1 i10_pr2 i10_pr3 i10_pr4 i10_pr5 sum
#> 1 7466851 3 BW28ZZZ BR30Y0Z BR39Y0Z 0
#> 2 7023336 1 0BDC8ZX 0BDC8W7 07D78ZX 0
#> 3 2481935 3 5A09357 3C1ZX8Z 06HN33Z B54CZZA 0W993ZX 1
#> 4 4605446 1 5A1955Z 0BH17EZ 03HY32Z 02HV33Z GZ58ZZZ 3
#> 5 7287173 2
list <- c("0B", "ND", "3J", "BW", "BR") 0
The output would look like:
#> ID visitorder i10_pr1 i10_pr2 i10_pr3 i10_pr4 i10_pr5 sum
#> 1 7466851 3 BW28ZZZ BR30Y0Z BR39Y0Z 0
#> 2 7023336 1 0BDC8ZX 0BDC8W7 07D78ZX 0
#> 4 4605446 1 5A1955Z 0BH17EZ 03HY32Z 02HV33Z GZ58ZZZ 3
CodePudding user response:
We may create a regex pattern to filter
the columns that starts_with
'i10_pr'
using if_any
to find rows having at least one match
library(stringr)
library(dplyr)
v1 <- c("0B", "ND", "3J", "BW", "BR")
pat <- sprintf('^(%s)', str_c(v1, collapse = "|"))
df1 %>%
filter(if_any(starts_with("i10_pr"), ~ str_detect(.x, pat)))
-output
ID visitorder i10_pr1 i10_pr2 i10_pr3 i10_pr4 i10_pr5 sum
1 7466851 3 BW28ZZZ BR30Y0Z BR39Y0Z <NA> <NA> 0
2 7023336 1 0BDC8ZX 0BDC8W7 07D78ZX <NA> <NA> 0
4 4605446 1 5A1955Z 0BH17EZ 03HY32Z 02HV33Z GZ58ZZZ 3
Or extract the first two characters and use %in%
df1 %>%
filter(if_any(starts_with("i10_pr"), ~ substr(.x, 1, 2) %in% v1))
-output
ID visitorder i10_pr1 i10_pr2 i10_pr3 i10_pr4 i10_pr5 sum
1 7466851 3 BW28ZZZ BR30Y0Z BR39Y0Z <NA> <NA> 0
2 7023336 1 0BDC8ZX 0BDC8W7 07D78ZX <NA> <NA> 0
4 4605446 1 5A1955Z 0BH17EZ 03HY32Z 02HV33Z GZ58ZZZ 3
data
df1 <- structure(list(ID = c(7466851L, 7023336L, 2481935L, 4605446L),
visitorder = c(3L, 1L, 3L, 1L), i10_pr1 = c("BW28ZZZ", "0BDC8ZX",
"5A09357", "5A1955Z"), i10_pr2 = c("BR30Y0Z", "0BDC8W7",
"3C1ZX8Z", "0BH17EZ"), i10_pr3 = c("BR39Y0Z", "07D78ZX",
"06HN33Z", "03HY32Z"), i10_pr4 = c(NA, NA, "B54CZZA", "02HV33Z"
), i10_pr5 = c(NA, NA, "0W993ZX", "GZ58ZZZ"), sum = c(0L,
0L, 1L, 3L)), class = "data.frame", row.names = c("1", "2",
"3", "4"))