I have a database with one index and a huge number of columns for said index. I want to add a column showing in a value is in any of the other columns for said index.
For example:
library(nycflights13)
flights %>%
select(tailnum,flight) %>%
filter(!is.na(tailnum)) %>%
arrange(tailnum) %>%
group_by(tailnum) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = "id", values_from = "flight")
# A tibble: 4,043 x 576
# Groups: tailnum [4,043]
tailnum `1` `2` `3` `4` `5` `6` `7` `8` `9` `10` `11` `12` `13` `14` `15` `16`
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 D942DN 2247 1685 1959 781 NA NA NA NA NA NA NA NA NA NA NA NA
2 N0EGMQ 4579 4584 4610 4662 4661 4610 4610 4584 4610 4669 4584 4610 4669 4584 4610 4589
3 N10156 4560 4269 4667 4334 4298 4520 4297 4370 4352 4695 4214 4085 4370 4204 4641 4092
4 N102UW 1125 1830 2095 2069 2069 2071 1805 1767 1767 720 720 1103 1619 1431 1435 1288
5 N103US 1575 1427 975 1125 2095 2088 2053 2095 2095 2069 802 2017 2017 1691 1431 1435
6 N104UW 1973 1125 2053 1767 1972 1767 1767 1963 2017 1767 975 1443 1103 1987 1081 1288
7 N10575 4617 4352 4434 4250 4537 4572 4106 3841 4305 3817 3819 4120 4202 4393 4316 4276
8 N105UW 2095 2095 1767 2069 1767 1834 2053 2069 1767 2069 1767 1745 978 1081 1435 1435
9 N107US 1491 2095 2095 1830 2189 1972 1751 2069 2069 1767 1491 1125 1081 1987 1435 1697
10 N108UW 1125 840 2086 2088 2088 1767 1767 2017 1767 1767 1767 1125 1987 1895 1973 1435
I want to add a column at the end of the table showing if any of the other columns contained "1767".
CodePudding user response:
This can be done with if_any
out <- flights %>%
select(tailnum,flight) %>%
filter(!is.na(tailnum)) %>%
arrange(tailnum) %>%
group_by(tailnum) %>%
mutate(id = row_number()) %>%
ungroup %>%
pivot_wider(names_from = "id", values_from = "flight") %>%
mutate(new = if_any(where(is.numeric), ~ . %in% 1767))
-checking
> head(out$new)
[1] FALSE FALSE FALSE TRUE FALSE TRUE
> sort(unique(which(head(out) == 1767, arr.ind = TRUE)[,1]))
[1] 4 6
CodePudding user response:
We can use rowwise
with any
.
library(tidyverse)
library(nycflights13)
df <-
flights %>%
select(tailnum,flight) %>%
filter(!is.na(tailnum)) %>%
arrange(tailnum) %>%
group_by(tailnum) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = "id", values_from = "flight")
df <-
df %>%
rowwise() %>%
mutate(contains_1767 = any(c_across(where(is.numeric)) == 1767, na.rm = TRUE))
df[, ncol(df)]
#> # A tibble: 4,043 × 1
#> # Rowwise:
#> contains_1767
#> <lgl>
#> 1 FALSE
#> 2 FALSE
#> 3 FALSE
#> 4 TRUE
#> 5 FALSE
#> 6 TRUE
#> 7 FALSE
#> 8 TRUE
#> 9 TRUE
#> 10 TRUE
#> # … with 4,033 more rows
system.time({df <-
df %>%
rowwise() %>%
mutate(contains_1767 = any(c_across(where(is.numeric)) == 1767, na.rm = TRUE))})
#> user system elapsed
#> 2.317 0.049 2.367
Created on 2021-12-29 by the reprex package (v2.0.1)
CodePudding user response:
Another possible solution, based on rowSums
:
library(tidyverse)
library(nycflights13)
flights %>%
select(tailnum,flight) %>%
filter(!is.na(tailnum)) %>%
arrange(tailnum) %>%
group_by(tailnum) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = "id", values_from = "flight")%>%
ungroup %>%
mutate(res = pmin(1, rowSums(. == 1767, na.rm = T)), .after=tailnum)
#> # A tibble: 4,043 × 577
#> tailnum res `1` `2` `3` `4` `5` `6` `7` `8` `9` `10`
#> <chr> <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 D942DN 0 2247 1685 1959 781 NA NA NA NA NA NA
#> 2 N0EGMQ 0 4579 4584 4610 4662 4661 4610 4610 4584 4610 4669
#> 3 N10156 0 4560 4269 4667 4334 4298 4520 4297 4370 4352 4695
#> 4 N102UW 1 1125 1830 2095 2069 2069 2071 1805 1767 1767 720
#> 5 N103US 0 1575 1427 975 1125 2095 2088 2053 2095 2095 2069
#> 6 N104UW 1 1973 1125 2053 1767 1972 1767 1767 1963 2017 1767
#> 7 N10575 0 4617 4352 4434 4250 4537 4572 4106 3841 4305 3817
#> 8 N105UW 1 2095 2095 1767 2069 1767 1834 2053 2069 1767 2069
#> 9 N107US 1 1491 2095 2095 1830 2189 1972 1751 2069 2069 1767
#> 10 N108UW 1 1125 840 2086 2088 2088 1767 1767 2017 1767 1767
#> # … with 4,033 more rows, and 565 more variables: 11 <int>, 12 <int>, 13 <int>,
#> # 14 <int>, 15 <int>, 16 <int>, 17 <int>, 18 <int>, 19 <int>, 20 <int>,
#> # 21 <int>, 22 <int>, 23 <int>, 24 <int>, 25 <int>, 26 <int>, 27 <int>,
#> # 28 <int>, 29 <int>, 30 <int>, 31 <int>, 32 <int>, 33 <int>, 34 <int>,
#> # 35 <int>, 36 <int>, 37 <int>, 38 <int>, 39 <int>, 40 <int>, 41 <int>,
#> # 42 <int>, 43 <int>, 44 <int>, 45 <int>, 46 <int>, 47 <int>, 48 <int>,
#> # 49 <int>, 50 <int>, 51 <int>, 52 <int>, 53 <int>, 54 <int>, 55 <int>, …