Add a column based on all other columns in DB-CodePudding

I have a database with one index and a huge number of columns for said index. I want to add a column showing in a value is in any of the other columns for said index.

For example:

library(nycflights13)
flights %>% 
  select(tailnum,flight) %>%
  filter(!is.na(tailnum)) %>% 
  arrange(tailnum) %>% 
  group_by(tailnum) %>% 
  mutate(id = row_number()) %>%
  pivot_wider(names_from = "id", values_from = "flight")

# A tibble: 4,043 x 576
# Groups:   tailnum [4,043]
   tailnum   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`  `10`  `11`  `12`  `13`  `14`  `15`  `16`
   <chr>   <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
 1 D942DN   2247  1685  1959   781    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
 2 N0EGMQ   4579  4584  4610  4662  4661  4610  4610  4584  4610  4669  4584  4610  4669  4584  4610  4589
 3 N10156   4560  4269  4667  4334  4298  4520  4297  4370  4352  4695  4214  4085  4370  4204  4641  4092
 4 N102UW   1125  1830  2095  2069  2069  2071  1805  1767  1767   720   720  1103  1619  1431  1435  1288
 5 N103US   1575  1427   975  1125  2095  2088  2053  2095  2095  2069   802  2017  2017  1691  1431  1435
 6 N104UW   1973  1125  2053  1767  1972  1767  1767  1963  2017  1767   975  1443  1103  1987  1081  1288
 7 N10575   4617  4352  4434  4250  4537  4572  4106  3841  4305  3817  3819  4120  4202  4393  4316  4276
 8 N105UW   2095  2095  1767  2069  1767  1834  2053  2069  1767  2069  1767  1745   978  1081  1435  1435
 9 N107US   1491  2095  2095  1830  2189  1972  1751  2069  2069  1767  1491  1125  1081  1987  1435  1697
10 N108UW   1125   840  2086  2088  2088  1767  1767  2017  1767  1767  1767  1125  1987  1895  1973  1435

I want to add a column at the end of the table showing if any of the other columns contained "1767".

CodePudding user response：

This can be done with if_any

out <- flights %>% 
  select(tailnum,flight) %>%
  filter(!is.na(tailnum)) %>% 
  arrange(tailnum) %>% 
  group_by(tailnum) %>% 
  mutate(id = row_number()) %>%
  ungroup %>%
  pivot_wider(names_from = "id", values_from = "flight") %>%
   mutate(new = if_any(where(is.numeric), ~ . %in% 1767))

-checking

> head(out$new)
[1] FALSE FALSE FALSE  TRUE FALSE  TRUE
> sort(unique(which(head(out) == 1767, arr.ind = TRUE)[,1]))
[1] 4 6

CodePudding user response：

We can use rowwise with any.

library(tidyverse)

library(nycflights13)
df <- 
flights %>% 
  select(tailnum,flight) %>%
  filter(!is.na(tailnum)) %>% 
  arrange(tailnum) %>% 
  group_by(tailnum) %>% 
  mutate(id = row_number()) %>%
  pivot_wider(names_from = "id", values_from = "flight")

df <- 
df %>% 
  rowwise() %>% 
  mutate(contains_1767 = any(c_across(where(is.numeric)) == 1767, na.rm = TRUE))

df[, ncol(df)]
#> # A tibble: 4,043 × 1
#> # Rowwise: 
#>    contains_1767
#>    <lgl>        
#>  1 FALSE        
#>  2 FALSE        
#>  3 FALSE        
#>  4 TRUE         
#>  5 FALSE        
#>  6 TRUE         
#>  7 FALSE        
#>  8 TRUE         
#>  9 TRUE         
#> 10 TRUE         
#> # … with 4,033 more rows


system.time({df <- 
  df %>% 
  rowwise() %>% 
  mutate(contains_1767 = any(c_across(where(is.numeric)) == 1767, na.rm = TRUE))})
#>    user  system elapsed 
#>   2.317   0.049   2.367

^{Created on 2021-12-29 by the reprex package (v2.0.1)}

CodePudding user response：

Another possible solution, based on rowSums:

library(tidyverse)

library(nycflights13)
flights %>% 
  select(tailnum,flight) %>%
  filter(!is.na(tailnum)) %>% 
  arrange(tailnum) %>% 
  group_by(tailnum) %>% 
  mutate(id = row_number()) %>%
  pivot_wider(names_from = "id", values_from = "flight")%>% 
  ungroup %>% 
  mutate(res = pmin(1, rowSums(. == 1767, na.rm = T)), .after=tailnum)

#> # A tibble: 4,043 × 577
#>    tailnum   res   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`  `10`
#>    <chr>   <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#>  1 D942DN      0  2247  1685  1959   781    NA    NA    NA    NA    NA    NA
#>  2 N0EGMQ      0  4579  4584  4610  4662  4661  4610  4610  4584  4610  4669
#>  3 N10156      0  4560  4269  4667  4334  4298  4520  4297  4370  4352  4695
#>  4 N102UW      1  1125  1830  2095  2069  2069  2071  1805  1767  1767   720
#>  5 N103US      0  1575  1427   975  1125  2095  2088  2053  2095  2095  2069
#>  6 N104UW      1  1973  1125  2053  1767  1972  1767  1767  1963  2017  1767
#>  7 N10575      0  4617  4352  4434  4250  4537  4572  4106  3841  4305  3817
#>  8 N105UW      1  2095  2095  1767  2069  1767  1834  2053  2069  1767  2069
#>  9 N107US      1  1491  2095  2095  1830  2189  1972  1751  2069  2069  1767
#> 10 N108UW      1  1125   840  2086  2088  2088  1767  1767  2017  1767  1767
#> # … with 4,033 more rows, and 565 more variables: 11 <int>, 12 <int>, 13 <int>,
#> #   14 <int>, 15 <int>, 16 <int>, 17 <int>, 18 <int>, 19 <int>, 20 <int>,
#> #   21 <int>, 22 <int>, 23 <int>, 24 <int>, 25 <int>, 26 <int>, 27 <int>,
#> #   28 <int>, 29 <int>, 30 <int>, 31 <int>, 32 <int>, 33 <int>, 34 <int>,
#> #   35 <int>, 36 <int>, 37 <int>, 38 <int>, 39 <int>, 40 <int>, 41 <int>,
#> #   42 <int>, 43 <int>, 44 <int>, 45 <int>, 46 <int>, 47 <int>, 48 <int>,
#> #   49 <int>, 50 <int>, 51 <int>, 52 <int>, 53 <int>, 54 <int>, 55 <int>, …