Home > Blockchain >  R rowise operations on dummy variables
R rowise operations on dummy variables

Time:02-16

I have a data set of ATM IDs that are coded with dummy variables representing openings and closures for each date. The goal is to generate new columns that tells us when the ATM first appeared in the data, the first time it closed after appearing in the data, and the last date it appeared in the data. Here is the data and output I am expecting:

data <- tribble(
  ~atm_id, ~nov_2019,  ~feb_2020, ~may_2020, ~aug_2020, ~first_appeared, ~closed, ~always_open, ~last_appeared,
  "xx1", 1,  1, 0, 0, "nov_2019", "may_2020", 0, "feb_2020",
  "xx2", 1,  1, 1, 1, "nov_2019", NA, 1, "aug_2020",
  "xx3", 0, 0, 0, 1, "aug_2020", NA, 0, "aug_2020",
  "xx4", 1, 0, 0, 1, "nov_2019", "feb_2020", 0, "aug_2020"
)

Note that atm_id xx3 is a new ATM that only appeared for the first time in aug_2020 so this will get an NA when asked whether it closed.

The last four columns are to be generate using the columns of dummy variables. I currently only have four dates represented, but this data is generate every Nov, Feb, May, and Aug. For example, this is the character vector used to generate the columns for the dates.

column_names <- c("nov_2019", "feb_2020", "may_2020", "aug_2020", "nov_2020", "feb_2021", "may_2021", "aug_2021", "nov_2021")

Is there a way to do this using dplyr or tidyverse packages?

CodePudding user response:

library(tidyverse)
data <- tribble(
  ~atm_id, ~nov_2019, ~feb_2020, ~may_2020, ~aug_2020, ~first_appeared, ~closed, ~always_open, ~last_appeared,
  "xx1", 1, 1, 0, 0, "nov_2019", "may_2020", 0, "feb_2020",
  "xx2", 1, 1, 1, 1, "nov_2019", NA, 1, "aug_2020",
  "xx3", 0, 0, 0, 1, "aug_2020", NA, 0, "aug_2020",
  "xx4", 1, 0, 0, 1, "nov_2019", "feb_2020", 0, "aug_2020"
)
date_levels <- c("nov_2019", "feb_2020", "may_2020", "aug_2020", "nov_2020", "feb_2021", "may_2021", "aug_2021", "nov_2021")

raw_data <-
  data %>%
  select(1:5)

raw_data_long <-
  raw_data %>%
  pivot_longer(-atm_id, names_to = "date", values_to = "open") %>%
  mutate(date = date %>% factor(date_levels)) %>%
  group_by(atm_id)
raw_data_long
#> # A tibble: 16 x 3
#> # Groups:   atm_id [4]
#>    atm_id date      open
#>    <chr>  <fct>    <dbl>
#>  1 xx1    nov_2019     1
#>  2 xx1    feb_2020     1
#>  3 xx1    may_2020     0
#>  4 xx1    aug_2020     0
#>  5 xx2    nov_2019     1
#>  6 xx2    feb_2020     1
#>  7 xx2    may_2020     1
#>  8 xx2    aug_2020     1
#>  9 xx3    nov_2019     0
#> 10 xx3    feb_2020     0
#> 11 xx3    may_2020     0
#> 12 xx3    aug_2020     1
#> 13 xx4    nov_2019     1
#> 14 xx4    feb_2020     0
#> 15 xx4    may_2020     0
#> 16 xx4    aug_2020     1

appeared <-
  raw_data_long %>%
  filter(open == 1) %>%
  arrange(date) %>%
  summarise(
    first_appeared = first(date),
    last_appeared = last(date)
  )
appeared
#> # A tibble: 4 x 3
#>   atm_id first_appeared last_appeared
#>   <chr>  <fct>          <fct>        
#> 1 xx1    nov_2019       feb_2020     
#> 2 xx2    nov_2019       aug_2020     
#> 3 xx3    aug_2020       aug_2020     
#> 4 xx4    nov_2019       aug_2020

always_open <-
  raw_data_long %>%
  mutate(n_open = open %>% keep(~ .x == 1) %>% length()) %>%
  summarise(always_open = as.numeric(n_open == n())) %>%
  distinct(always_open)
#> `summarise()` has grouped output by 'atm_id'. You can override using the `.groups` argument.
always_open
#> # A tibble: 4 x 2
#> # Groups:   atm_id [4]
#>   atm_id always_open
#>   <chr>        <dbl>
#> 1 xx1              0
#> 2 xx2              1
#> 3 xx3              0
#> 4 xx4              0

closed <-
  raw_data_long %>%
  filter(open == 0) %>%
  arrange(date) %>%
  summarise(closed = first(date)) %>%
  anti_join(
    # must start with open date
    raw_data_long %>% filter(first(open) == 0)
  )
#> Joining, by = "atm_id"
closed
#> # A tibble: 2 x 2
#>   atm_id closed  
#>   <chr>  <fct>   
#> 1 xx1    may_2020
#> 2 xx4    feb_2020

raw_data %>%
  left_join(appeared) %>%
  left_join(closed) %>%
  left_join(always_open)
#> Joining, by = "atm_id"
#> Joining, by = "atm_id"
#> Joining, by = "atm_id"
#> # A tibble: 4 x 9
#>   atm_id nov_2019 feb_2020 may_2020 aug_2020 first_appeared last_appeared closed
#>   <chr>     <dbl>    <dbl>    <dbl>    <dbl> <fct>          <fct>         <fct> 
#> 1 xx1           1        1        0        0 nov_2019       feb_2020      may_2…
#> 2 xx2           1        1        1        1 nov_2019       aug_2020      <NA>  
#> 3 xx3           0        0        0        1 aug_2020       aug_2020      <NA>  
#> 4 xx4           1        0        0        1 nov_2019       aug_2020      feb_2…
#> # … with 1 more variable: always_open <dbl>
  • Related