R data imputation from group_by table based on count-CodePudding

group = c(1,1,4,4,4,5,5,6,1,4,6,1,1,1,1,6,4,4,4,4,1,4,5,6)
animal = c('a','b','c','c','d','a','b','c','b','d','c','a','a','a','a','c','c','c','c','c','a','c','a','c')
sleep = c('y','n','y','y','y','n','n','y','n','y','n','y','y','n','m','y','n','n','n','n',NA, NA, NA, NA)

test = data.frame(group, animal, sleep)
print(test)

group_animal = test %>% group_by(`group`, `animal`) %>% count(sleep)
print(group_animal)

I would like to replace the NA values in the test df's sleep column by the highest count of sleep answer based on group and animal.

Such that Group 1, Animal a with NAs in the sleep column should have a sleep value of 'y' because that is the value with the highest count among Group 1 Animal a.

Group 4 animal c with NAs for sleep should have 'n' as the sleep value as well.

CodePudding user response：

Update now with group_by(group, animal) thnx @Quinten, removed prior answer:

group by animal
use replace_na with the replace argument as sleep[n==max(n)]
new: in case of ties like in group 5 add !is.na(sleep) to avoid conflicts:

library(dplyr)
library(tidyr)

group_animal %>%
    group_by(group, animal) %>% 
    arrange(desc(sleep), .by_group = TRUE) %>% 
    mutate(sleep = replace_na(sleep, sleep[n==max(n) & !is.na(sleep)]))

  group animal sleep     n
   <dbl> <chr>  <chr> <int>
 1     1 a      y         3
 2     1 a      n         1
 3     1 a      m         1
 4     1 a      y         1
 5     1 b      n         2
 6     4 c      y         2
 7     4 c      n         4
 8     4 c      n         1
 9     4 d      y         2
10     5 a      n         1
11     5 a      n         1
12     5 b      n         1
13     6 c      y         2
14     6 c      n         1
15     6 c      y         1

CodePudding user response：

Another option is replacing the NAs with the Mode. You can use the Mode function from this post in the na.aggregate function from zoo to replace these NAs like this:

Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

group = c(1,1,4,4,4,5,5,6,1,4,6,1,1,1,1,6,4,4,4,4,1,4,5,6)
animal = c('a','b','c','c','d','a','b','c','b','d','c','a','a','a','a','c','c','c','c','c','a','c','a','c')
sleep = c('y','n','y','y','y','n','n','y','n','y','n','y','y','n','m','y','n','n','n','n',NA, NA, NA, NA)
test = data.frame(group, animal, sleep)

library(dplyr)
library(zoo)
test %>%
  group_by(group, animal) %>%
  mutate(sleep = na.aggregate(sleep , FUN=Mode)) %>%
  ungroup()
#> # A tibble: 24 × 3
#>    group animal sleep
#>    <dbl> <chr>  <chr>
#>  1     1 a      y    
#>  2     1 b      n    
#>  3     4 c      y    
#>  4     4 c      y    
#>  5     4 d      y    
#>  6     5 a      n    
#>  7     5 b      n    
#>  8     6 c      y    
#>  9     1 b      n    
#> 10     4 d      y    
#> # … with 14 more rows
#> # ℹ Use `print(n = ...)` to see more rows

^{Created on 2022-07-26 by the reprex package (v2.0.1)}

Here is tail of output:

> tail(test)
# A tibble: 6 × 3
  group animal sleep
  <dbl> <chr>  <chr>
1     4 c      n    
2     4 c      n    
3     1 a      y    
4     4 c      n    
5     5 a      n    
6     6 c      y

CodePudding user response：

Try this.

This method essential creates a custom column to coalesce with sleep, it subsets sleep based on the max count values obtained from str_count

library(dplyr)

test |> 
  group_by(group, animal) |> 
  mutate(sleep = coalesce(sleep, sleep[max(stringr::str_count(paste(sleep, collapse = ""), pattern = sleep), na.rm = TRUE)])) |>
  ungroup()

   group animal sleep
1      1      a     y
2      1      b     n
3      4      c     y
4      4      c     y
5      4      d     y
6      5      a     n
7      5      b     n
8      6      c     y
9      1      b     n
10     4      d     y
11     6      c     n
12     1      a     y
13     1      a     y
14     1      a     n
15     1      a     m
16     6      c     y
17     4      c     n
18     4      c     n
19     4      c     n
20     4      c     n
21     1      a     y
22     4      c     n
23     5      a     n
24     6      c     n