I have the following dataframe df
structure(list(Names = c("James", "Diana", "Dr. Michel", "Mr. George", "Hello world", "Janis","JJK"), code = c("xx34-2111", "7789-2869", "dg67-5555", "1555-2uuy", "fhgj-2jj8", "2199-7kk8","1111-1111"), `13/A1` = c(NA, "‡(2017)", NA, "‡(2017)", "A",
"A(2019)", "A(2016)"), `13/A2` = c(NA, "‡(2017)", NA, "‡(2017)",
"A", "A(2019)", "A(2016)"), `13/A3` = c(NA, "‡(2017)", NA, "‡(2017)",
"A", "A(2019)", "A(2016)"), `13/A4` = c(NA, "‡(2017)", NA, "‡(2017)",
"A", "A(2019)", "A(2016)"), `13/A5` = c(NA, "‡(2017)", NA, "‡(2017)",
"A", "A(2019)", "A(2016)"), `13/B1` = c("A", "A", "A(2017)",
"‡(2017)", "A", "A(2019)", NA), `13/B2` = c("A", "A", "A(2017)",
"‡(2017)", "A", "A(2019)", NA), `13/B3` = c("A", "A", "A(2017)",
"‡(2017)", "A", "A(2019)", NA), `13/B4` = c("A", "A", "A(2017)",
"‡(2017)", "A", "A(2019)", NA), `13/B5` = c("A", "A", "A(2017)",
"‡(2017)", "A", "A(2019)", NA), `13/C1` = c(NA, "A", NA, "‡(2017)",
"A", "A(2019)", NA), `13/D1` = c(NA, "‡(2017)", NA, "‡(2017)",
NA, "A", NA), `13/D2` = c(NA, "‡(2017)", NA, "‡(2017)", NA, "A",
NA), `13/D3` = c(NA, "‡(2017)", NA, "‡(2017)", NA, "A", "A(2017)"
), `13/D4` = c("A", "‡(2017)", NA, "‡(2017)", NA, "A", NA)), row.names = c(1L,
6L, 11L, 39L, 101L, 483L, 667L), class = "data.frame")
I would like to keep only the rows of df
which contain in at least one of the columns 3:17
, any of the following entries: A
, A(2016)
, A(2017)
, A(2019)
.
For the example above, this means to delete only the row containing Mr. George
in the field Names
.
My attempt
df[,3:17]%in% c("A","A(2016)","A(2017)","A(2019)")
does not give the desired result.
CodePudding user response:
Loop through columns - sapply
, check if values matches - %in%
, then check if any column TRUE per row - rowSums(...) > 0
df[ rowSums(sapply(df[, 3:17], function(i) i %in% c("A","A(2016)","A(2017)","A(2019)"))) > 0, ]
CodePudding user response:
Using grepl
> rowSums(matrix(grepl("^A$|A\\(2019\\)|A\\(2017\\)|A\\(2016\\)",unlist(df)),nrow(df)))>0
[1] TRUE TRUE TRUE FALSE TRUE TRUE TRUE
CodePudding user response:
With a fully dplyr
approach:
library(dplyr)
df %>%
filter(if_any(3:17, ~ .x %in% c("A","A(2016)","A(2017)","A(2019)")))
#> Names code 13/A1 13/A2 13/A3 13/A4 13/A5 13/B1 13/B2
#> 1 James xx34-2111 <NA> <NA> <NA> <NA> <NA> A A
#> 2 Diana 7789-2869 ‡(2017) ‡(2017) ‡(2017) ‡(2017) ‡(2017) A A
#> 3 Dr. Michel dg67-5555 <NA> <NA> <NA> <NA> <NA> A(2017) A(2017)
#> 4 Hello world fhgj-2jj8 A A A A A A A
#> 5 Janis 2199-7kk8 A(2019) A(2019) A(2019) A(2019) A(2019) A(2019) A(2019)
#> 6 JJK 1111-1111 A(2016) A(2016) A(2016) A(2016) A(2016) <NA> <NA>
#> 13/B3 13/B4 13/B5 13/C1 13/D1 13/D2 13/D3 13/D4
#> 1 A A A <NA> <NA> <NA> <NA> A
#> 2 A A A A ‡(2017) ‡(2017) ‡(2017) ‡(2017)
#> 3 A(2017) A(2017) A(2017) <NA> <NA> <NA> <NA> <NA>
#> 4 A A A A <NA> <NA> <NA> <NA>
#> 5 A(2019) A(2019) A(2019) A(2019) A A A A
#> 6 <NA> <NA> <NA> <NA> <NA> <NA> A(2017) <NA>
Another possible solution:
library(magrittr)
df[((df[,3:17] == "A") (df[,3:17] == "A(2016)")
(df[,3:17] == "A(2017)") (df[,3:17] == "A(2019)")) %>%
apply(1, \(x) sum(x[3:17], na.rm = T)) != 0, ]
#> Names code 13/A1 13/A2 13/A3 13/A4 13/A5 13/B1
#> 1 James xx34-2111 <NA> <NA> <NA> <NA> <NA> A
#> 6 Diana 7789-2869 ‡(2017) ‡(2017) ‡(2017) ‡(2017) ‡(2017) A
#> 11 Dr. Michel dg67-5555 <NA> <NA> <NA> <NA> <NA> A(2017)
#> 101 Hello world fhgj-2jj8 A A A A A A
#> 483 Janis 2199-7kk8 A(2019) A(2019) A(2019) A(2019) A(2019) A(2019)
#> 667 JJK 1111-1111 A(2016) A(2016) A(2016) A(2016) A(2016) <NA>
#> 13/B2 13/B3 13/B4 13/B5 13/C1 13/D1 13/D2 13/D3 13/D4
#> 1 A A A A <NA> <NA> <NA> <NA> A
#> 6 A A A A A ‡(2017) ‡(2017) ‡(2017) ‡(2017)
#> 11 A(2017) A(2017) A(2017) A(2017) <NA> <NA> <NA> <NA> <NA>
#> 101 A A A A A <NA> <NA> <NA> <NA>
#> 483 A(2019) A(2019) A(2019) A(2019) A(2019) A A A A
#> 667 <NA> <NA> <NA> <NA> <NA> <NA> <NA> A(2017) <NA>