R: Expand and wider dataset about date-CodePudding

In the example df, I want to move the "group" variable with 5 levels to 5 different variables, and determine which group they belong in. (I know it's kind of weird formatting, but that's easier for future calculation and analysis)

Example df:

|x   |group   |date          |
|1   |1       |2021-01-01    |
|1   |1       |2021-01-02    |
|1   |1       |2021-01-03    |
|1   |2       |2021-01-10    |
|1   |2       |2021-01-11    |
|1   |3       |2021-01-20    |
|1   |3       |2021-01-21    |
|1   |3       |2021-01-22    |
|1   |4       |2021-02-22    |
|1   |5       |2021-03-22    |

Expected result:

|x   |date          |group1   |group2   |group3   |group4   |group5   |
|1   |2021-01-01    |TRUE     |FALSE    |FALSE    |FALSE    |FALSE    |
|1   |2021-01-02    |TRUE     |FALSE    |FALSE    |FALSE    |FALSE    |
|1   |2021-01-03    |TRUE     |FALSE    |FALSE    |FALSE    |FALSE    |
|1   |2021-01-10    |FALSE    |TRUE     |FALSE    |FALSE    |FALSE    |
|1   |2021-01-11    |FALSE    |TRUE     |FALSE    |FALSE    |FALSE    |
|1   |2021-01-20    |FALSE    |FALSE    |TRUE     |FALSE    |FALSE    |
|1   |2021-01-21    |FALSE    |FALSE    |TRUE     |FALSE    |FALSE    |
|1   |2021-01-22    |FALSE    |FALSE    |TRUE     |FALSE    |FALSE    |
|1   |2021-02-22    |FALSE    |FALSE    |FALSE    |TRUE     |FALSE    |
|1   |2021-03-22    |FALSE    |FALSE    |FALSE    |FALSE    |TRUE     |

I've tried pivot_wider, but I don't know where should be the values from. And I've also tried the following codes:

df = df %>%
  mutate(group1= 0,
         group2= 0,
         group3= 0,
         group4= 0,
         group5= 0) %>%
  for (i in 1:nrow(df)){
  if(group == 1){group1 = TRUE}
  else if(group == 2){group2 = TRUE}
  else if(group == 3){group3 = TRUE}
  else if(group == 3){group4 = TRUE}
  else {group5 = TRUE}
  }

And the error is: Error in for (. in i) 1:nrow(df) :4 arguments passed to 'for' which requires 3

CodePudding user response：

You can use tidyverse and pivot_wider for that (slightly different dataset, idea is the same):

library(tidyverse)
df <- data.frame(x = rep(1,10), group = rep(1:5,2), date = seq(1,10))
df <- df %>%  pivot_wider(names_from = group, values_from = group)
df[,3:7] <- apply(df[,3:7],2,is.na)

That gives:

# A tibble: 10 x 7
       x  date `1`   `2`   `3`   `4`   `5`  
   <dbl> <int> <lgl> <lgl> <lgl> <lgl> <lgl>
 1     1     1 FALSE TRUE  TRUE  TRUE  TRUE 
 2     1     2 TRUE  FALSE TRUE  TRUE  TRUE 
 3     1     3 TRUE  TRUE  FALSE TRUE  TRUE 
 4     1     4 TRUE  TRUE  TRUE  FALSE TRUE 
 5     1     5 TRUE  TRUE  TRUE  TRUE  FALSE
 6     1     6 FALSE TRUE  TRUE  TRUE  TRUE 
 7     1     7 TRUE  FALSE TRUE  TRUE  TRUE 
 8     1     8 TRUE  TRUE  FALSE TRUE  TRUE 
 9     1     9 TRUE  TRUE  TRUE  FALSE TRUE 
10     1    10 TRUE  TRUE  TRUE  TRUE  FALSE

CodePudding user response：

After transforming group class to factor we could do:

1. tidyverse way:

library(purrr)
library(tidyr)
library(dplyr)

df %>% 
    mutate(group = as.factor(group),
           group_X = map(group, ~set_names(levels(group) == .x,
                                            levels(group)))) %>% 
    unnest_wider(group_X)

      x group date       group_1 group_2 group_3 group_4 group_5
   <int> <fct> <chr>      <lgl>   <lgl>   <lgl>   <lgl>   <lgl>  
 1     1 1     2021-01-01 TRUE    FALSE   FALSE   FALSE   FALSE  
 2     1 1     2021-01-02 TRUE    FALSE   FALSE   FALSE   FALSE  
 3     1 1     2021-01-03 TRUE    FALSE   FALSE   FALSE   FALSE  
 4     1 2     2021-01-10 FALSE   TRUE    FALSE   FALSE   FALSE  
 5     1 2     2021-01-11 FALSE   TRUE    FALSE   FALSE   FALSE  
 6     1 3     2021-01-20 FALSE   FALSE   TRUE    FALSE   FALSE  
 7     1 3     2021-01-21 FALSE   FALSE   TRUE    FALSE   FALSE  
 8     1 3     2021-01-22 FALSE   FALSE   TRUE    FALSE   FALSE  
 9     1 4     2021-02-22 FALSE   FALSE   FALSE   TRUE    FALSE  
10     1 5     2021-03-22 FALSE   FALSE   FALSE   FALSE   TRUE

2. sapply:

library(dplyr)
df$group <- as.factor(df$group)
df %>%  cbind(sapply(paste0("group_",levels(.$group)), `==`, .$group))

x group       date group_1 group_2 group_3 group_4 group_5
1  1     1 2021-01-01   FALSE   FALSE   FALSE   FALSE   FALSE
2  1     1 2021-01-02   FALSE   FALSE   FALSE   FALSE   FALSE
3  1     1 2021-01-03   FALSE   FALSE   FALSE   FALSE   FALSE
4  1     2 2021-01-10   FALSE   FALSE   FALSE   FALSE   FALSE
5  1     2 2021-01-11   FALSE   FALSE   FALSE   FALSE   FALSE
6  1     3 2021-01-20   FALSE   FALSE   FALSE   FALSE   FALSE
7  1     3 2021-01-21   FALSE   FALSE   FALSE   FALSE   FALSE
8  1     3 2021-01-22   FALSE   FALSE   FALSE   FALSE   FALSE
9  1     4 2021-02-22   FALSE   FALSE   FALSE   FALSE   FALSE
10 1     5 2021-03-22   FALSE   FALSE   FALSE   FALSE   FALSE

CodePudding user response：

library(tidyverse)
library(lubridate)

df = tibble(
  x=1,
  group=sample(1:5,20, replace=TRUE),
  date=sample(seq(ymd("20210101"),
                  ymd("20210201"),
                  ddays(1)),
               20, replace=TRUE)
)

df %>% mutate(
  group1 = group==1,
  group2 = group==2,  
  group3 = group==3,
  group4 = group==4,
  group5 = group==5,
)

Output

 A tibble: 20 x 8
       x group date       group1 group2 group3 group4 group5
   <dbl> <int> <date>     <lgl>  <lgl>  <lgl>  <lgl>  <lgl> 
 1     1     1 2021-01-01 TRUE   FALSE  FALSE  FALSE  FALSE 
 2     1     3 2021-01-01 FALSE  FALSE  TRUE   FALSE  FALSE 
 3     1     5 2021-01-01 FALSE  FALSE  FALSE  FALSE  TRUE  
 4     1     1 2021-01-01 TRUE   FALSE  FALSE  FALSE  FALSE 
 5     1     3 2021-01-01 FALSE  FALSE  TRUE   FALSE  FALSE 
 6     1     4 2021-01-01 FALSE  FALSE  FALSE  TRUE   FALSE 
 7     1     4 2021-01-01 FALSE  FALSE  FALSE  TRUE   FALSE 
 8     1     5 2021-01-01 FALSE  FALSE  FALSE  FALSE  TRUE  
 9     1     4 2021-01-01 FALSE  FALSE  FALSE  TRUE   FALSE 
10     1     5 2021-01-01 FALSE  FALSE  FALSE  FALSE  TRUE  
11     1     2 2021-01-01 FALSE  TRUE   FALSE  FALSE  FALSE 
12     1     1 2021-01-01 TRUE   FALSE  FALSE  FALSE  FALSE 
13     1     5 2021-01-01 FALSE  FALSE  FALSE  FALSE  TRUE  
14     1     1 2021-01-01 TRUE   FALSE  FALSE  FALSE  FALSE 
15     1     1 2021-01-01 TRUE   FALSE  FALSE  FALSE  FALSE 
16     1     3 2021-01-01 FALSE  FALSE  TRUE   FALSE  FALSE 
17     1     4 2021-01-01 FALSE  FALSE  FALSE  TRUE   FALSE 
18     1     5 2021-01-01 FALSE  FALSE  FALSE  FALSE  TRUE  
19     1     2 2021-01-01 FALSE  TRUE   FALSE  FALSE  FALSE 
20     1     5 2021-01-01 FALSE  FALSE  FALSE  FALSE  TRUE

CodePudding user response：

We can use pivot_wider from tidyr to get the data in wide format. In values_fn we can return TRUE for the group values that is present and return FALSE otherwise using values_fill argument.

tidyr::pivot_wider(df, 
                   names_from = group, 
                   values_from = group, 
                   values_fn = function(x) TRUE, 
                   names_prefix = 'group', 
                   values_fill = FALSE)

#       x date       group1 group2 group3 group4 group5
#   <int> <chr>      <lgl>  <lgl>  <lgl>  <lgl>  <lgl> 
# 1     1 2021-01-01 TRUE   FALSE  FALSE  FALSE  FALSE 
# 2     1 2021-01-02 TRUE   FALSE  FALSE  FALSE  FALSE 
# 3     1 2021-01-03 TRUE   FALSE  FALSE  FALSE  FALSE 
# 4     1 2021-01-10 FALSE  TRUE   FALSE  FALSE  FALSE 
# 5     1 2021-01-11 FALSE  TRUE   FALSE  FALSE  FALSE 
# 6     1 2021-01-20 FALSE  FALSE  TRUE   FALSE  FALSE 
# 7     1 2021-01-21 FALSE  FALSE  TRUE   FALSE  FALSE 
# 8     1 2021-01-22 FALSE  FALSE  TRUE   FALSE  FALSE 
# 9     1 2021-02-22 FALSE  FALSE  FALSE  TRUE   FALSE 
#10     1 2021-03-22 FALSE  FALSE  FALSE  FALSE  TRUE

data

It is easier to help if you provide data in a reproducible format

df <- structure(list(x = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
    group = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 5L), date = c("2021-01-01", 
    "2021-01-02", "2021-01-03", "2021-01-10", "2021-01-11", "2021-01-20", 
    "2021-01-21", "2021-01-22", "2021-02-22", "2021-03-22")), row.names = c(NA, 
-10L), class = "data.frame")