I have the following dataframe
dput(head(df,14))
structure(list(A1 = c("02:01:01", "02:01:01", "02:01:01", "23",
"29:02:01", "02:01:01", "03:01:01", "32:01:01", "26", "03:02:01",
"11:01:01", "02:01:01", "02:01:01", "02:01:01"), B1 = c("15:01:01",
"44:02:01", "15:01:01", "38", "44:03:01", "44:02:01", "27:05:02",
"27:05:02", "41", "08:01:01", "40:01:02", "15:01:01", "07:02:01",
"35:12:01"), C1 = c("03:03:01", "05:01:01", "01:02:01", "12",
"16:01:01", "05:01:01", "07:02:01", "12:03:01", "17", "03:04:01",
"07:01:01", "03:04:01", "05:01:01", "04:01:01"), D1 = c("04",
"04", "01", "11", "14", "11", "09", "01", "13", "<NA>", "<NA>",
"01:01:01", "01:01:01", "08:02:01"), E1 = c("02", "02", "<NA>",
"02", "<NA>", "<NA>", "<NA>", "<NA>", "03", "<NA>", "<NA>", "02:02:01",
"<NA>", "<NA>"), F1 = c("<NA>", "<NA>", "<NA>", "<NA>", "01",
"01", "01:03:02", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>",
"<NA>"), G1 = c("<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>",
"<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "01:01:01", "<NA>"
), H1 = c("01:04:01", "03:03:01", "01:01:01", "01", "03:03:01",
"05:05:01", "05:01:01", "01:01:01", "05", "03:01:01", "05:01:01",
"01:01:01", "01:01:01", "04:01:01"), I1 = c("03:02:01", "03:01:01",
"05:01:01", "03", "05:03:01", "03:01:01", "03:03:02", "05:01:01",
"06", "02:01:01", "03:02:01", "03:01:01", "05:01:01", "04:02:01"
), J1 = c("01:03:01", "01:03:01", "01:03:01", "01", "01:03:01",
"01:03:01", "02:01:02", "01:03:01", "02", "01:03:01", "02:01:02",
"01:03:01", "01:03:01", "01:03:01"), K1 = c("02:01", "04:01:01",
"03:01", "04:01", "03:01", "23:01:01", "02:01", "04:02:01", "13:01",
"01:01", "03:01", "03:01", "04:01:01", "04:01")), row.names = c(NA,
14L), class = "data.frame")
I want to extract rows on the following conditions:
Rows that have the values without
:
separator in any column like thisstructure(list(A1 = c("02:01:01", "02:01:01", "02:01:01", "23", "29:02:01", "02:01:01", "03:01:01", "32:01:01", "26"), B1 = c("15:01:01", "44:02:01", "15:01:01", "38", "44:03:01", "44:02:01", "27:05:02", "27:05:02", "41"), C1 = c("03:03:01", "05:01:01", "01:02:01", "12", "16:01:01", "05:01:01", "07:02:01", "12:03:01", "17"), D1 = c(4L, 4L, 1L, 11L, 14L, 11L, 9L, 1L, 13L), E1 = c("02", "02", "", "02", "", "", "", "", "03"), F1 = c("", "", "", "", "01", "01", "01:03:02", "", ""), G1 = c("", "", "", "", "", "", "", "", ""), H1 = c("01:04:01", "03:03:01", "01:01:01", "01", "03:03:01", "05:05:01", "05:01:01", "01:01:01", "05"), I1 = c("03:02:01", "03:01:01", "05:01:01", "03", "05:03:01", "03:01:01", "03:03:02", "05:01:01", "06"), J1 = c("01:03:01", "01:03:01", "01:03:01", "01", "01:03:01", "01:03:01", "02:01:02", "01:03:01", "02"), K1 = c("02:01", "04:01:01", "03:01", "04:01", "03:01", "23:01:01", "02:01", "04:02:01", "13:01")), row.names = c(NA, 9L), class = "data.frame")
Rows which has the values with
:
separator only lie this..structure(list(A1 = c("03:02:01", "11:01:01", "02:01:01", "02:01:01", "02:01:01"), B1 = c("08:01:01", "40:01:02", "15:01:01", "07:02:01", "35:12:01"), C1 = c("03:04:01", "07:01:01", "03:04:01", "05:01:01", "04:01:01"), D1 = c("", "", "01:01:01", "01:01:01", "08:02:01" ), E1 = c("", "", "02:02:01", "", ""), F1 = c("", "", "", "", ""), G1 = c("", "", "", "01:01:01", ""), H1 = c("03:01:01", "05:01:01", "01:01:01", "01:01:01", "04:01:01"), I1 = c("02:01:01", "03:02:01", "03:01:01", "05:01:01", "04:02:01"), J1 = c("01:03:01", "02:01:02", "01:03:01", "01:03:01", "01:03:01"), K1 = c("01:01", "03:01", "03:01", "04:01:01", "04:01")), row.names = c(NA, 5L), class = "data.frame")
This is what I have tried so far:
df<- with(df, df[ !grepl( ':', A1) | !grepl( ':', B1) | !grepl( ':', C1) | !grepl( ':', D1),])
However, this extracts rows with NA
, which also does not have the :
separator.
CodePudding user response:
Here is one approach using dplyr
.
You can filter
checking all columns to contain either the colon (:) or character value of "NA" (note in your example, they are character values and not NA
symbol). Here you can use if_all
or across
:
library(dplyr)
df %>%
filter(if_all(everything(), ~ grepl(":|<NA>", .)))
Output
A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1
1 03:02:01 08:01:01 03:04:01 <NA> <NA> <NA> <NA> 03:01:01 02:01:01 01:03:01 01:01
2 11:01:01 40:01:02 07:01:01 <NA> <NA> <NA> <NA> 05:01:01 03:02:01 02:01:02 03:01
3 02:01:01 15:01:01 03:04:01 01:01:01 02:02:01 <NA> <NA> 01:01:01 03:01:01 01:03:01 03:01
4 02:01:01 07:02:01 05:01:01 01:01:01 <NA> <NA> 01:01:01 01:01:01 05:01:01 01:03:01 04:01:01
5 02:01:01 35:12:01 04:01:01 08:02:01 <NA> <NA> <NA> 04:01:01 04:02:01 01:03:01 04:01
To get rows where there are characters without the colon, you can use if_any
:
df %>%
filter(if_any(everything(), ~ !grepl(":|<NA>", .)))
Output
A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1
1 02:01:01 15:01:01 03:03:01 04 02 <NA> <NA> 01:04:01 03:02:01 01:03:01 02:01
2 02:01:01 44:02:01 05:01:01 04 02 <NA> <NA> 03:03:01 03:01:01 01:03:01 04:01:01
3 02:01:01 15:01:01 01:02:01 01 <NA> <NA> <NA> 01:01:01 05:01:01 01:03:01 03:01
4 23 38 12 11 02 <NA> <NA> 01 03 01 04:01
5 29:02:01 44:03:01 16:01:01 14 <NA> 01 <NA> 03:03:01 05:03:01 01:03:01 03:01
6 02:01:01 44:02:01 05:01:01 11 <NA> 01 <NA> 05:05:01 03:01:01 01:03:01 23:01:01
7 03:01:01 27:05:02 07:02:01 09 <NA> 01:03:02 <NA> 05:01:01 03:03:02 02:01:02 02:01
8 32:01:01 27:05:02 12:03:01 01 <NA> <NA> <NA> 01:01:01 05:01:01 01:03:01 04:02:01
9 26 41 17 13 03 <NA> <NA> 05 06 02 13:01
Finally, if your real data has NA
symbol and not character values containing "NA", then you can try:
df %>%
filter(if_all(everything(), ~ grepl(":", .) | is.na(.)))
and
df %>%
filter(if_any(everything(), ~ !(grepl(":", .) | is.na(.))))
CodePudding user response:
An alternative to across()
is to use map/apply
functions.
library(tidyverse)
# DATA --------------------------------------------------------------------
df <-
structure(list(A1 = c("02:01:01", "02:01:01", "02:01:01", "23",
"29:02:01", "02:01:01", "03:01:01", "32:01:01", "26", "03:02:01",
"11:01:01", "02:01:01", "02:01:01", "02:01:01"), B1 = c("15:01:01",
"44:02:01", "15:01:01", "38", "44:03:01", "44:02:01", "27:05:02",
"27:05:02", "41", "08:01:01", "40:01:02", "15:01:01", "07:02:01",
"35:12:01"), C1 = c("03:03:01", "05:01:01", "01:02:01", "12",
"16:01:01", "05:01:01", "07:02:01", "12:03:01", "17", "03:04:01",
"07:01:01", "03:04:01", "05:01:01", "04:01:01"), D1 = c("04",
"04", "01", "11", "14", "11", "09", "01", "13", "<NA>", "<NA>",
"01:01:01", "01:01:01", "08:02:01"), E1 = c("02", "02", "<NA>",
"02", "<NA>", "<NA>", "<NA>", "<NA>", "03", "<NA>", "<NA>", "02:02:01",
"<NA>", "<NA>"), F1 = c("<NA>", "<NA>", "<NA>", "<NA>", "01",
"01", "01:03:02", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>",
"<NA>"), G1 = c("<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>",
"<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "01:01:01", "<NA>"
), H1 = c("01:04:01", "03:03:01", "01:01:01", "01", "03:03:01",
"05:05:01", "05:01:01", "01:01:01", "05", "03:01:01", "05:01:01",
"01:01:01", "01:01:01", "04:01:01"), I1 = c("03:02:01", "03:01:01",
"05:01:01", "03", "05:03:01", "03:01:01", "03:03:02", "05:01:01",
"06", "02:01:01", "03:02:01", "03:01:01", "05:01:01", "04:02:01"
), J1 = c("01:03:01", "01:03:01", "01:03:01", "01", "01:03:01",
"01:03:01", "02:01:02", "01:03:01", "02", "01:03:01", "02:01:02",
"01:03:01", "01:03:01", "01:03:01"), K1 = c("02:01", "04:01:01",
"03:01", "04:01", "03:01", "23:01:01", "02:01", "04:02:01", "13:01",
"01:01", "03:01", "03:01", "04:01:01", "04:01")), row.names = c(NA,
14L), class = "data.frame")
# CODE --------------------------------------------------------------------
condition <- map_dfc(df, ~!str_detect(., ':') & !str_detect(., '<NA>')) %>%
apply(1, any)
df <- as_tibble(df)
filter(df,!condition)
#> # A tibble: 5 × 11
#> A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 03:02:01 08:01:01 03:04:01 <NA> <NA> <NA> <NA> 03:0… 02:0… 01:0… 01:01
#> 2 11:01:01 40:01:02 07:01:01 <NA> <NA> <NA> <NA> 05:0… 03:0… 02:0… 03:01
#> 3 02:01:01 15:01:01 03:04:01 01:01:01 02:02… <NA> <NA> 01:0… 03:0… 01:0… 03:01
#> 4 02:01:01 07:02:01 05:01:01 01:01:01 <NA> <NA> 01:0… 01:0… 05:0… 01:0… 04:0…
#> 5 02:01:01 35:12:01 04:01:01 08:02:01 <NA> <NA> <NA> 04:0… 04:0… 01:0… 04:01
#or without negation
filter(df,condition)
#> # A tibble: 9 × 11
#> A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 02:01:01 15:01:01 03:03:01 04 02 <NA> <NA> 01:04… 03:0… 01:0… 02:01
#> 2 02:01:01 44:02:01 05:01:01 04 02 <NA> <NA> 03:03… 03:0… 01:0… 04:0…
#> 3 02:01:01 15:01:01 01:02:01 01 <NA> <NA> <NA> 01:01… 05:0… 01:0… 03:01
#> 4 23 38 12 11 02 <NA> <NA> 01 03 01 04:01
#> 5 29:02:01 44:03:01 16:01:01 14 <NA> 01 <NA> 03:03… 05:0… 01:0… 03:01
#> 6 02:01:01 44:02:01 05:01:01 11 <NA> 01 <NA> 05:05… 03:0… 01:0… 23:0…
#> 7 03:01:01 27:05:02 07:02:01 09 <NA> 01:03:02 <NA> 05:01… 03:0… 02:0… 02:01
#> 8 32:01:01 27:05:02 12:03:01 01 <NA> <NA> <NA> 01:01… 05:0… 01:0… 04:0…
#> 9 26 41 17 13 03 <NA> <NA> 05 06 02 13:01
Created on 2021-11-21 by the reprex package (v2.0.1)