Replace NA in muliple column by group in r-CodePudding

df <- data.frame(A = c(NA,5,4,NA,1),
                 B = c(1,NA,1,1,NA), 
                 C = c(3,3,NA,NA,6), 
                 D = c(0,0,1,1,1))

I have something like above dataset and trying to replace the NA values with the mean of the subgroup from target varibale D.

I tried the following code to replace them individually.

df <- df %>% 
  group_by(D) %>% 
  mutate(
    A = ifelse(is.na(A), 
                mean(A, na.rm=TRUE),A)
  ) %>% 
  mutate(
    B = ifelse(is.na(B), 
                mean(B, na.rm=TRUE),B)
  ) %>% 
  mutate(
    C = ifelse(is.na(C), 
                mean(C, na.rm=TRUE),C)
  )

Is there more efficent way to impute the mean values?

CodePudding user response：

Perhaps this 'tidyverse' approach will suit:

library(tidyverse)

df <- data.frame(A = c(NA,5,4,NA,1),
                 B = c(1,NA,1,1,NA), 
                 C = c(3,3,NA,NA,6), 
                 D = c(0,0,1,1,1))

df_output <- df %>% 
  group_by(D) %>% 
  mutate(
    A = ifelse(is.na(A), 
               mean(A, na.rm=TRUE),A)
  ) %>% 
  mutate(
    B = ifelse(is.na(B), 
               mean(B, na.rm=TRUE),B)
  ) %>% 
  mutate(
    C = ifelse(is.na(C), 
               mean(C, na.rm=TRUE),C)
  )
df_output
#> # A tibble: 5 × 4
#> # Groups:   D [2]
#>       A     B     C     D
#>   <dbl> <dbl> <dbl> <dbl>
#> 1   5       1     3     0
#> 2   5       1     3     0
#> 3   4       1     6     1
#> 4   2.5     1     6     1
#> 5   1       1     6     1

df_output_2 <- df %>%
  group_by(D) %>%
  mutate(across(A:C, ~replace_na(.x, mean(.x, na.rm = TRUE))))
df_output_2
#> # A tibble: 5 × 4
#> # Groups:   D [2]
#>       A     B     C     D
#>   <dbl> <dbl> <dbl> <dbl>
#> 1   5       1     3     0
#> 2   5       1     3     0
#> 3   4       1     6     1
#> 4   2.5     1     6     1
#> 5   1       1     6     1

all_equal(df_output, df_output_2)
#> [1] TRUE

^{Created on 2022-10-04 by the reprex package (v2.0.1)}

CodePudding user response：

I encountered the same problem before but my dataset was bigger. In these cases, I use mutate_all

df %>% group_by(D) %>% mutate_all(funs(replace(., is.na(.), mean(., na.rm = TRUE))))


      A     B     C     D
  <dbl> <dbl> <dbl> <dbl>
1   5       1     3     0
2   5       1     3     0
3   4       1     6     1
4   2.5     1     6     1
5   1       1     6     1