Extract rows having digits without : separator-CodePudding

I have the following dataframe

dput(head(df,14))
structure(list(A1 = c("02:01:01", "02:01:01", "02:01:01", "23", 
"29:02:01", "02:01:01", "03:01:01", "32:01:01", "26", "03:02:01", 
"11:01:01", "02:01:01", "02:01:01", "02:01:01"), B1 = c("15:01:01", 
"44:02:01", "15:01:01", "38", "44:03:01", "44:02:01", "27:05:02", 
"27:05:02", "41", "08:01:01", "40:01:02", "15:01:01", "07:02:01", 
"35:12:01"), C1 = c("03:03:01", "05:01:01", "01:02:01", "12", 
"16:01:01", "05:01:01", "07:02:01", "12:03:01", "17", "03:04:01", 
"07:01:01", "03:04:01", "05:01:01", "04:01:01"), D1 = c("04", 
"04", "01", "11", "14", "11", "09", "01", "13", "<NA>", "<NA>", 
"01:01:01", "01:01:01", "08:02:01"), E1 = c("02", "02", "<NA>", 
"02", "<NA>", "<NA>", "<NA>", "<NA>", "03", "<NA>", "<NA>", "02:02:01", 
"<NA>", "<NA>"), F1 = c("<NA>", "<NA>", "<NA>", "<NA>", "01", 
"01", "01:03:02", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", 
"<NA>"), G1 = c("<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", 
"<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "01:01:01", "<NA>"
), H1 = c("01:04:01", "03:03:01", "01:01:01", "01", "03:03:01", 
"05:05:01", "05:01:01", "01:01:01", "05", "03:01:01", "05:01:01", 
"01:01:01", "01:01:01", "04:01:01"), I1 = c("03:02:01", "03:01:01", 
"05:01:01", "03", "05:03:01", "03:01:01", "03:03:02", "05:01:01", 
"06", "02:01:01", "03:02:01", "03:01:01", "05:01:01", "04:02:01"
), J1 = c("01:03:01", "01:03:01", "01:03:01", "01", "01:03:01", 
"01:03:01", "02:01:02", "01:03:01", "02", "01:03:01", "02:01:02", 
"01:03:01", "01:03:01", "01:03:01"), K1 = c("02:01", "04:01:01", 
"03:01", "04:01", "03:01", "23:01:01", "02:01", "04:02:01", "13:01", 
"01:01", "03:01", "03:01", "04:01:01", "04:01")), row.names = c(NA, 
14L), class = "data.frame")

I want to extract rows on the following conditions:

Rows that have the values without : separator in any column like this

structure(list(A1 = c("02:01:01", "02:01:01", "02:01:01", "23", "29:02:01", "02:01:01", "03:01:01", "32:01:01", "26"), B1 = c("15:01:01", "44:02:01", "15:01:01", "38", "44:03:01", "44:02:01", "27:05:02", "27:05:02", "41"), C1 = c("03:03:01", "05:01:01", "01:02:01", "12", "16:01:01", "05:01:01", "07:02:01", "12:03:01", "17"), D1 = c(4L, 4L, 1L, 11L, 14L, 11L, 9L, 1L, 13L), E1 = c("02", "02", "", "02", "", "", "", "", "03"), F1 = c("", "", "", "", "01", "01", "01:03:02", "", ""), G1 = c("", "", "", "", "", "", "", "", ""), H1 = c("01:04:01", "03:03:01", "01:01:01", "01", "03:03:01", "05:05:01", "05:01:01", "01:01:01", "05"), I1 = c("03:02:01", "03:01:01", "05:01:01", "03", "05:03:01", "03:01:01", "03:03:02", "05:01:01", "06"), J1 = c("01:03:01", "01:03:01", "01:03:01", "01", "01:03:01", "01:03:01", "02:01:02", "01:03:01", "02"), K1 = c("02:01", "04:01:01", "03:01", "04:01", "03:01", "23:01:01", "02:01", "04:02:01", "13:01")), row.names = c(NA, 9L), class = "data.frame")
Rows which has the values with : separator only lie this..

structure(list(A1 = c("03:02:01", "11:01:01", "02:01:01", "02:01:01", "02:01:01"), B1 = c("08:01:01", "40:01:02", "15:01:01", "07:02:01", "35:12:01"), C1 = c("03:04:01", "07:01:01", "03:04:01", "05:01:01", "04:01:01"), D1 = c("", "", "01:01:01", "01:01:01", "08:02:01" ), E1 = c("", "", "02:02:01", "", ""), F1 = c("", "", "", "", ""), G1 = c("", "", "", "01:01:01", ""), H1 = c("03:01:01", "05:01:01", "01:01:01", "01:01:01", "04:01:01"), I1 = c("02:01:01", "03:02:01", "03:01:01", "05:01:01", "04:02:01"), J1 = c("01:03:01", "02:01:02", "01:03:01", "01:03:01", "01:03:01"), K1 = c("01:01", "03:01", "03:01", "04:01:01", "04:01")), row.names = c(NA, 5L), class = "data.frame")

This is what I have tried so far:

df<- with(df, df[ !grepl( ':', A1) | !grepl( ':', B1) | !grepl( ':', C1) | !grepl( ':', D1),])

However, this extracts rows with NA, which also does not have the : separator.

CodePudding user response：

Here is one approach using dplyr.

You can filter checking all columns to contain either the colon (:) or character value of "NA" (note in your example, they are character values and not NA symbol). Here you can use if_all or across:

library(dplyr)

df %>% 
  filter(if_all(everything(), ~ grepl(":|<NA>", .)))

Output

        A1       B1       C1       D1       E1   F1       G1       H1       I1       J1       K1
1 03:02:01 08:01:01 03:04:01     <NA>     <NA> <NA>     <NA> 03:01:01 02:01:01 01:03:01    01:01
2 11:01:01 40:01:02 07:01:01     <NA>     <NA> <NA>     <NA> 05:01:01 03:02:01 02:01:02    03:01
3 02:01:01 15:01:01 03:04:01 01:01:01 02:02:01 <NA>     <NA> 01:01:01 03:01:01 01:03:01    03:01
4 02:01:01 07:02:01 05:01:01 01:01:01     <NA> <NA> 01:01:01 01:01:01 05:01:01 01:03:01 04:01:01
5 02:01:01 35:12:01 04:01:01 08:02:01     <NA> <NA>     <NA> 04:01:01 04:02:01 01:03:01    04:01

To get rows where there are characters without the colon, you can use if_any:

df %>% 
  filter(if_any(everything(), ~ !grepl(":|<NA>", .)))

Output

        A1       B1       C1 D1   E1       F1   G1       H1       I1       J1       K1
1 02:01:01 15:01:01 03:03:01 04   02     <NA> <NA> 01:04:01 03:02:01 01:03:01    02:01
2 02:01:01 44:02:01 05:01:01 04   02     <NA> <NA> 03:03:01 03:01:01 01:03:01 04:01:01
3 02:01:01 15:01:01 01:02:01 01 <NA>     <NA> <NA> 01:01:01 05:01:01 01:03:01    03:01
4       23       38       12 11   02     <NA> <NA>       01       03       01    04:01
5 29:02:01 44:03:01 16:01:01 14 <NA>       01 <NA> 03:03:01 05:03:01 01:03:01    03:01
6 02:01:01 44:02:01 05:01:01 11 <NA>       01 <NA> 05:05:01 03:01:01 01:03:01 23:01:01
7 03:01:01 27:05:02 07:02:01 09 <NA> 01:03:02 <NA> 05:01:01 03:03:02 02:01:02    02:01
8 32:01:01 27:05:02 12:03:01 01 <NA>     <NA> <NA> 01:01:01 05:01:01 01:03:01 04:02:01
9       26       41       17 13   03     <NA> <NA>       05       06       02    13:01

Finally, if your real data has NA symbol and not character values containing "NA", then you can try:

df %>% 
  filter(if_all(everything(), ~ grepl(":", .) | is.na(.)))

and

df %>% 
  filter(if_any(everything(), ~ !(grepl(":", .) | is.na(.))))

CodePudding user response：

An alternative to across() is to use map/apply functions.

library(tidyverse)


# DATA --------------------------------------------------------------------


df <- 
structure(list(A1 = c("02:01:01", "02:01:01", "02:01:01", "23", 
                      "29:02:01", "02:01:01", "03:01:01", "32:01:01", "26", "03:02:01", 
                      "11:01:01", "02:01:01", "02:01:01", "02:01:01"), B1 = c("15:01:01", 
                                                                              "44:02:01", "15:01:01", "38", "44:03:01", "44:02:01", "27:05:02", 
                                                                              "27:05:02", "41", "08:01:01", "40:01:02", "15:01:01", "07:02:01", 
                                                                              "35:12:01"), C1 = c("03:03:01", "05:01:01", "01:02:01", "12", 
                                                                                                  "16:01:01", "05:01:01", "07:02:01", "12:03:01", "17", "03:04:01", 
                                                                                                  "07:01:01", "03:04:01", "05:01:01", "04:01:01"), D1 = c("04", 
                                                                                                                                                          "04", "01", "11", "14", "11", "09", "01", "13", "<NA>", "<NA>", 
                                                                                                                                                          "01:01:01", "01:01:01", "08:02:01"), E1 = c("02", "02", "<NA>", 
                                                                                                                                                                                                      "02", "<NA>", "<NA>", "<NA>", "<NA>", "03", "<NA>", "<NA>", "02:02:01", 
                                                                                                                                                                                                      "<NA>", "<NA>"), F1 = c("<NA>", "<NA>", "<NA>", "<NA>", "01", 
                                                                                                                                                                                                                              "01", "01:03:02", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", 
                                                                                                                                                                                                                              "<NA>"), G1 = c("<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", 
                                                                                                                                                                                                                                              "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "<NA>", "01:01:01", "<NA>"
                                                                                                                                                                                                                              ), H1 = c("01:04:01", "03:03:01", "01:01:01", "01", "03:03:01", 
                                                                                                                                                                                                                                        "05:05:01", "05:01:01", "01:01:01", "05", "03:01:01", "05:01:01", 
                                                                                                                                                                                                                                        "01:01:01", "01:01:01", "04:01:01"), I1 = c("03:02:01", "03:01:01", 
                                                                                                                                                                                                                                                                                    "05:01:01", "03", "05:03:01", "03:01:01", "03:03:02", "05:01:01", 
                                                                                                                                                                                                                                                                                    "06", "02:01:01", "03:02:01", "03:01:01", "05:01:01", "04:02:01"
                                                                                                                                                                                                                                        ), J1 = c("01:03:01", "01:03:01", "01:03:01", "01", "01:03:01", 
                                                                                                                                                                                                                                                  "01:03:01", "02:01:02", "01:03:01", "02", "01:03:01", "02:01:02", 
                                                                                                                                                                                                                                                  "01:03:01", "01:03:01", "01:03:01"), K1 = c("02:01", "04:01:01", 
                                                                                                                                                                                                                                                                                              "03:01", "04:01", "03:01", "23:01:01", "02:01", "04:02:01", "13:01", 
                                                                                                                                                                                                                                                                                              "01:01", "03:01", "03:01", "04:01:01", "04:01")), row.names = c(NA, 
                                                                                                                                                                                                                                                                                                                                                              14L), class = "data.frame")


# CODE --------------------------------------------------------------------


condition <- map_dfc(df, ~!str_detect(., ':') & !str_detect(., '<NA>')) %>% 
              apply(1, any)

df <- as_tibble(df)

filter(df,!condition)
#> # A tibble: 5 × 11
#>   A1       B1       C1       D1       E1     F1    G1    H1    I1    J1    K1   
#>   <chr>    <chr>    <chr>    <chr>    <chr>  <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 03:02:01 08:01:01 03:04:01 <NA>     <NA>   <NA>  <NA>  03:0… 02:0… 01:0… 01:01
#> 2 11:01:01 40:01:02 07:01:01 <NA>     <NA>   <NA>  <NA>  05:0… 03:0… 02:0… 03:01
#> 3 02:01:01 15:01:01 03:04:01 01:01:01 02:02… <NA>  <NA>  01:0… 03:0… 01:0… 03:01
#> 4 02:01:01 07:02:01 05:01:01 01:01:01 <NA>   <NA>  01:0… 01:0… 05:0… 01:0… 04:0…
#> 5 02:01:01 35:12:01 04:01:01 08:02:01 <NA>   <NA>  <NA>  04:0… 04:0… 01:0… 04:01

#or without negation

filter(df,condition)
#> # A tibble: 9 × 11
#>   A1       B1       C1       D1    E1    F1       G1    H1     I1    J1    K1   
#>   <chr>    <chr>    <chr>    <chr> <chr> <chr>    <chr> <chr>  <chr> <chr> <chr>
#> 1 02:01:01 15:01:01 03:03:01 04    02    <NA>     <NA>  01:04… 03:0… 01:0… 02:01
#> 2 02:01:01 44:02:01 05:01:01 04    02    <NA>     <NA>  03:03… 03:0… 01:0… 04:0…
#> 3 02:01:01 15:01:01 01:02:01 01    <NA>  <NA>     <NA>  01:01… 05:0… 01:0… 03:01
#> 4 23       38       12       11    02    <NA>     <NA>  01     03    01    04:01
#> 5 29:02:01 44:03:01 16:01:01 14    <NA>  01       <NA>  03:03… 05:0… 01:0… 03:01
#> 6 02:01:01 44:02:01 05:01:01 11    <NA>  01       <NA>  05:05… 03:0… 01:0… 23:0…
#> 7 03:01:01 27:05:02 07:02:01 09    <NA>  01:03:02 <NA>  05:01… 03:0… 02:0… 02:01
#> 8 32:01:01 27:05:02 12:03:01 01    <NA>  <NA>     <NA>  01:01… 05:0… 01:0… 04:0…
#> 9 26       41       17       13    03    <NA>     <NA>  05     06    02    13:01

^{Created on 2021-11-21 by the reprex package (v2.0.1)}