Home > Enterprise >  Expand each group to the max n of rows
Expand each group to the max n of rows

Time:08-14

How can I expand each group to the length of the largest group?

# Example data: three ID groups with 2, 1 and 3 rows respectively.
df <- data.frame(
  ID = c(1L, 1L, 2L, 3L, 3L, 3L),
  col1 = c("A", "B", "O", "U", "L", "R"),
  stringsAsFactors = FALSE
)


ID col1
1 A
1 B
2 O
3 U
3 L
3 R

Desired Output:

1 A
1 B
NA NA
2 O
NA NA
NA NA
3 U
3 L
3 R

CodePudding user response:

Here's a base R solution.

Split df by the ID column, then use lapply to iterate over the pieces, and rbind each piece with a data frame of NA rows if it has fewer rows than the largest group (max(table(df$ID)), which is 3 here).

# Size of the largest ID group; every group is padded up to this many rows.
# Computed once here rather than once per group inside the lambda.
max_n <- max(table(df$ID))

# Single all-NA row that is repeated to pad the shorter groups.
na_row <- data.frame(ID = NA, col1 = NA)

do.call(
  rbind,
  lapply(
    split(df, df$ID),
    # rep(1, 0) selects zero rows, so the largest group is left unchanged.
    \(x) rbind(x, na_row[rep(1, max_n - nrow(x)), ])
  )
)

      ID col1
1.1    1    A
1.2    1    B
1.3   NA <NA>
2.3    2    O
2.1   NA <NA>
2.1.1 NA <NA>
3.4    3    U
3.5    3    L
3.6    3    R

CodePudding user response:

Another option could be:

# Longest run of identical consecutive IDs in df.
max_n <- max(rle(df[["ID"]])$lengths)

df %>%
  group_split(ID) %>%
  map_dfr(function(grp) {
    # The padding tibble only carries col1, one NA per row the group is short.
    padding <- tibble(col1 = rep(NA_character_, max_n - group_size(grp)))
    rows_append(grp, padding)
  })

     ID col1 
  <int> <chr>
1     1 A    
2     1 B    
3    NA NA   
4     2 O    
5    NA NA   
6    NA NA   
7     3 U    
8     3 L    
9     3 R 

CodePudding user response:

Here is a possible tidyverse solution. We can use add_row inside of summarise to add n rows to each group. I use max(count(df, ID)$n) to get the max group length, then I subtract the number of rows in each group from it to get the number of rows that need to be added for that group. I use rep to produce the correct number of values to add for each group. Finally, I replace ID with NA wherever there is an NA in col1.

library(tidyverse)

# Largest group size across all IDs.
max_n <- max(count(df, ID)$n)

df %>%
  group_by(ID) %>%
  summarise(
    # Append one NA col1 value per row this group is short of max_n.
    add_row(cur_data(), col1 = rep(NA_character_, max_n - n())),
    .groups = "drop"
  ) %>%
  # Blank out ID wherever col1 is NA.
  mutate(ID = replace(ID, is.na(col1), NA))

Output

     ID col1 
  <int> <chr>
1     1 A    
2     1 B    
3    NA NA   
4     2 O    
5    NA NA   
6    NA NA   
7     3 U    
8     3 L    
9     3 R    

Or another option without using add_row:

library(dplyr)

# Get maximum number of rows for all groups
N <- max(count(df, ID)$n)

df %>%
  group_by(ID) %>%
  # Pad each group's col1 with NA until it holds N values.
  summarise(col1 = c(col1, rep(NA, N - length(col1))), .groups = "drop") %>%
  # Blank out ID wherever col1 is NA.
  mutate(ID = replace(ID, is.na(col1), NA))

CodePudding user response:

A base R option using merge and rle

# Runs of consecutive identical IDs.
runs <- rle(df$ID)

# Full grid: each run value repeated as many times as the longest run,
# with q numbering the rows within each ID.
grid <- data.frame(ID = rep(runs$values, each = max(runs$lengths)))
grid$q <- ave(grid$ID, grid$ID, FUN = seq_along)

# Observed data with the same within-ID row number.
observed <- df
observed$q <- ave(observed$ID, observed$ID, FUN = seq_along)

# Outer join on the shared columns (ID, q), then drop the second column (q).
merge(grid, observed, all = TRUE)[-2]

gives

  ID col1
1  1    A
2  1    B
3  1 <NA>
4  2    O
5  2 <NA>
6  2 <NA>
7  3    U
8  3    L
9  3    R

A data.table option may also work

> setDT(df)[, .(col1 = `length<-`(col1, max(df[, .N, ID][, N]))), ID]
   ID col1
1:  1    A
2:  1    B
3:  1 <NA>
4:  2    O
5:  2 <NA>
6:  2 <NA>
7:  3    U
8:  3    L
9:  3    R

CodePudding user response:

An option using tidyr::complete to expand every combination of ID and row_new, then using row_old (which is NA on the rows complete adds) to replace ID with NA.

library(tidyverse)

df %>%
  group_by(ID) %>%
  # Two copies of the within-group row number: row_new is expanded by
  # complete(), row_old is kept to tell original rows from added ones.
  mutate(row_new = row_number(), row_old = row_number()) %>%
  ungroup() %>%
  complete(ID, row_new) %>%
  # Rows where row_old is NA get their ID set to NA.
  mutate(ID = if_else(is.na(row_old), NA_integer_, ID)) %>%
  select(-matches("row_"))

# A tibble: 9 x 2
     ID col1 
  <int> <chr>
1     1 A    
2     1 B    
3    NA <NA> 
4     2 O    
5    NA <NA> 
6    NA <NA> 
7     3 U    
8     3 L    
9     3 R    

CodePudding user response:

# Largest group size.
n <- max(table(df$ID))

df %>%
  group_by(ID) %>%
  # Indexing past the end of col1 yields NA, which pads each group to n values.
  summarise(col1 = col1[seq_len(n)], .groups = "drop") %>%
  # Blank out ID wherever col1 is NA.
  mutate(ID = replace(ID, is.na(col1), NA))

# A tibble: 9 x 2
     ID col1 
  <int> <chr>
1     1 A    
2     1 B    
3    NA NA   
4     2 O    
5    NA NA   
6    NA NA   
7     3 U    
8     3 L    
9     3 R 

CodePudding user response:

You can take advantage of the fact that df[n_bigger_than_nrow,] gives a row of NAs

dplyr

# Largest group size across all IDs.
max_n <- max(count(df, ID)$n)

df %>%
  group_by(ID) %>%
  # Row indices beyond a group's last row come back as all-NA rows.
  # seq_len() is used instead of seq() so that a count of 0 would give no
  # indices rather than c(1, 0).
  summarise(cur_data()[seq_len(max_n), ])
#> `summarise()` has grouped output by 'ID'. You can override using the `.groups`
#> argument.
#> # A tibble: 9 × 2
#> # Groups:   ID [3]
#>      ID col1 
#>   <int> <chr>
#> 1     1 A    
#> 2     1 B    
#> 3     1 <NA> 
#> 4     2 O    
#> 5     2 <NA> 
#> 6     2 <NA> 
#> 7     3 U    
#> 8     3 L    
#> 9     3 R

base R

# Number of rows in each ID group, and the size of the largest group.
n <- tapply(df$ID, df$ID, length)
max_n <- max(n)

# Within-group row positions, padded with Inf up to max_n entries per group.
i <- unlist(
  lapply(n, \(x) c(seq_len(x), rep(Inf, max_n - x))),
  use.names = FALSE
)
# Shift each group's positions by the number of rows in the groups before it.
# Inf stays Inf after the addition, so padded slots remain out of range.
i <- i + rep(c(0, cumsum(head(n, -1))), each = max_n)
df <- df[i, ]
rownames(df) <- NULL
# Carry the previous non-NA ID forward into rows whose ID is NA.
df$ID <- Reduce(\(x, y) if (is.na(y) && !is.na(x)) x else y, df$ID, accumulate = TRUE)

df
#>   ID col1
#> 1  1    A
#> 2  1    B
#> 3  1 <NA>
#> 4  2    O
#> 5  2 <NA>
#> 6  2 <NA>
#> 7  3    U
#> 8  3    L
#> 9  3    R

CodePudding user response:

Another base R solution using sequence.

# Lengths of the runs of consecutive identical IDs.
i <- rle(df$ID)$lengths

print(
  df[
    sequence(
      # Two counts per run: the run's own length, then how many rows it is
      # short of the longest run.
      abs(rep(i, each = 2) - c(0L, max(i))),
      # Two start indices per run: the run's first row, then that row offset
      # by nrow(df), which lies past the end of df.
      rep(cumsum(c(1L, i))[-length(i) - 1L], each = 2) + c(0L, nrow(df))
    ),
  ],
  row.names = FALSE
)
#>  ID col1
#>   1    A
#>   1    B
#>  NA <NA>
#>   2    O
#>  NA <NA>
#>  NA <NA>
#>   3    U
#>   3    L
#>   3    R
  • Related