How can I expand each group to the length of the largest group?
# Example data: three ID groups with 2, 1 and 3 rows respectively.
df <- data.frame(
  ID = c(1L, 1L, 2L, 3L, 3L, 3L),
  col1 = c("A", "B", "O", "U", "L", "R")
)
ID col1
1 A
1 B
2 O
3 U
3 L
3 R
Desired Output:
1 A
1 B
NA NA
2 O
NA NA
NA NA
3 U
3 L
3 R
CodePudding user response:
Here's a base R solution.
split
the df
by the ID
column, then use lapply
to iterate over the split df
, and rbind
with a data frame of NA
if there are fewer rows than 3 (max(table(df$ID))
).
# Pad every ID group with all-NA rows until it has as many rows as the
# largest group, then stack the padded groups back into one data frame.
do.call(
  rbind,
  lapply(split(df, df$ID), function(grp) {
    # Rows this group is short of the largest group (0 for the largest one).
    n_missing <- max(table(df$ID)) - nrow(grp)
    na_row <- data.frame(ID = NA, col1 = NA)
    # Repeating row index 1 `n_missing` times yields that many NA rows.
    rbind(grp, na_row[rep(1, n_missing), ])
  })
)
ID col1
1.1 1 A
1.2 1 B
1.3 NA <NA>
2.3 2 O
2.1 NA <NA>
2.1.1 NA <NA>
3.4 3 U
3.5 3 L
3.6 3 R
CodePudding user response:
Another option could be:
library(dplyr)
library(purrr)

# Size of the largest ID group. count() tallies every row per ID, so this
# stays correct when equal IDs are not adjacent in df (rle() run lengths
# would undercount in that case and make the rep() count negative).
max_n <- max(count(df, ID)$n)

df %>%
  group_split(ID) %>%
  # Each piece holds the rows of one ID, so nrow() is that group's size.
  # The appended rows only supply col1; ID comes out as NA for them.
  map_dfr(
    ~ rows_append(.x, tibble(col1 = rep(NA_character_, max_n - nrow(.x))))
  )
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R
CodePudding user response:
Here is a possible tidyverse
solution. We can use add_row
inside of summarise
to add n number of rows to each group. I use max(count(df, ID)$n)
to get the max group length, then I subtract the number of rows in each group from that maximum to get the number of rows that need to be added to each group. I use rep
to produce the correct number of values that we need to add for each group. Finally, I replace ID
with NA
when there is an NA
in col1
.
library(tidyverse)

df %>%
  group_by(ID) %>%
  # Per group, append one NA value of col1 for every row the group is short
  # of the largest group (max group size minus this group's size).
  summarise(
    add_row(cur_data(), col1 = rep(NA_character_, max(count(df, ID)$n) - n())),
    .groups = "drop"
  ) %>%
  # Blank the ID wherever col1 is NA. Note this also hits any input row
  # whose col1 was already NA.
  mutate(ID = replace(ID, is.na(col1), NA))
Output
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R
Or another option without using add_row
:
library(dplyr)

# Size of the largest ID group; every group is padded up to this length.
N <- max(count(df, ID)$n)

df %>%
  group_by(ID) %>%
  # Append NAs to col1 until the group holds N values.
  summarise(col1 = c(col1, rep(NA, N - n())), .groups = "drop") %>%
  # Padding rows are recognised by their NA col1; blank their ID as well.
  mutate(ID = replace(ID, is.na(col1), NA))
CodePudding user response:
A base R option using merge
and rle
# Build a skeleton with max-group-size rows per ID, number the rows within
# each ID in both the skeleton and df, and outer-join the two so that the
# skeleton rows without a match in df appear with NA in col1.
local({
  runs <- rle(df$ID)

  # Adds q, the position of each row within its ID group.
  add_position <- function(d) transform(d, q = ave(ID, ID, FUN = seq_along))

  skeleton <- data.frame(ID = rep(runs$values, each = max(runs$lengths)))

  # Column 2 of the merged result is the helper q; drop it.
  merge(add_position(skeleton), add_position(df), all = TRUE)[-2]
})
gives
ID col1
1 1 A
2 1 B
3 1 <NA>
4 2 O
5 2 <NA>
6 2 <NA>
7 3 U
8 3 L
9 3 R
A data.table
option may also work
> setDT(df)[, .(col1 = `length<-`(col1, max(df[, .N, ID][, N]))), ID]
ID col1
1: 1 A
2: 1 B
3: 1 <NA>
4: 2 O
5: 2 <NA>
6: 2 <NA>
7: 3 U
8: 3 L
9: 3 R
CodePudding user response:
An option to tidyr::complete
the ID and row_new, using row_old to replace ID with NA.
library(tidyverse)

df %>%
  group_by(ID) %>%
  # Two copies of the within-group row number: row_new is expanded by
  # complete(), row_old stays NA on the rows that complete() creates.
  mutate(row_new = row_number(), row_old = row_new) %>%
  ungroup() %>%
  # Every ID x row_new combination; missing ones are added filled with NA.
  complete(ID, row_new) %>%
  # Rows added by complete() have no row_old, so blank their ID.
  mutate(ID = if_else(is.na(row_old), NA_integer_, ID)) %>%
  select(-matches("row_"))
# A tibble: 9 x 2
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA <NA>
4 2 O
5 NA <NA>
6 NA <NA>
7 3 U
8 3 L
9 3 R
CodePudding user response:
# Size of the largest ID group.
n <- max(table(df$ID))

df %>%
  group_by(ID) %>%
  # Extending col1 to length n pads the shorter groups with NA.
  summarise(col1 = `length<-`(col1, n), .groups = "drop") %>%
  # Padding rows carry NA in col1; set their ID to NA too.
  mutate(ID = replace(ID, is.na(col1), NA))
# A tibble: 9 x 2
ID col1
<int> <chr>
1 1 A
2 1 B
3 NA NA
4 2 O
5 NA NA
6 NA NA
7 3 U
8 3 L
9 3 R
CodePudding user response:
You can take advantage of the fact that df[n_bigger_than_nrow,]
gives a row of NA
s
dplyr
# Size of the largest ID group.
max_n <- max(count(df, ID)$n)

df %>%
  group_by(ID) %>%
  # Row indices past the end of a group's data come back as all-NA rows,
  # so taking rows 1..max_n pads every group to the same length.
  summarise(cur_data()[seq_len(max_n), ])
#> `summarise()` has grouped output by 'ID'. You can override using the `.groups`
#> argument.
#> # A tibble: 9 × 2
#> # Groups: ID [3]
#> ID col1
#> <int> <chr>
#> 1 1 A
#> 2 1 B
#> 3 1 <NA>
#> 4 2 O
#> 5 2 <NA>
#> 6 2 <NA>
#> 7 3 U
#> 8 3 L
#> 9 3 R
base R
# Rows per ID (in sorted ID order) and the size of the largest group.
n <- tapply(df$ID, df$ID, length)
max_n <- max(n)

# Within-group row positions, padded with Inf up to max_n per group.
# An Inf index selects no existing row, so it yields an all-NA row below.
i <- c(vapply(n, \(x) c(seq_len(x), rep(Inf, max_n - x)), numeric(max_n)))

# Shift each group's positions by the number of rows that precede it in df
# (this assumes df is ordered by ID). Inf + offset stays Inf.
i <- i + rep(c(0, cumsum(head(n, -1))), each = max_n)

df <- df[i, ]
rownames(df) <- NULL

# Carry the last non-NA ID forward into the padding rows.
df$ID <- Reduce(\(x, y) if (is.na(y) && !is.na(x)) x else y, df$ID, accumulate = TRUE)
df
#> ID col1
#> 1 1 A
#> 2 1 B
#> 3 1 <NA>
#> 4 2 O
#> 5 2 <NA>
#> 6 2 <NA>
#> 7 3 U
#> 8 3 L
#> 9 3 R
CodePudding user response:
Another base R solution using sequence
.
# Run lengths of ID, i.e. the group sizes when df is ordered by ID.
i <- rle(df$ID)$lengths

print(
  df[
    # For every group, two index runs are generated back to back:
    #   1. the group's own rows: length i, starting at its first row;
    #   2. its padding: length max(i) - i, starting past nrow(df), so each
    #      of those indices selects an all-NA row.
    sequence(
      abs(rep(i, each = 2) - c(0L, max(i))),
      rep(cumsum(c(1L, i))[-length(i) - 1L], each = 2) + c(0L, nrow(df))
    ),
  ],
  row.names = FALSE
)
#> ID col1
#> 1 A
#> 1 B
#> NA <NA>
#> 2 O
#> NA <NA>
#> NA <NA>
#> 3 U
#> 3 L
#> 3 R