df <- data.frame(A = c(NA,5,4,NA,1),
B = c(1,NA,1,1,NA),
C = c(3,3,NA,NA,6),
D = c(0,0,1,1,1))
I have something like above dataset and trying to replace the NA values with the mean of the subgroup from target varibale D.
I tried the following code to replace them individually.
df <- df %>%
group_by(D) %>%
mutate(
A = ifelse(is.na(A),
mean(A, na.rm=TRUE),A)
) %>%
mutate(
B = ifelse(is.na(B),
mean(B, na.rm=TRUE),B)
) %>%
mutate(
C = ifelse(is.na(C),
mean(C, na.rm=TRUE),C)
)
Is there more efficent way to impute the mean values?
CodePudding user response:
Perhaps this 'tidyverse' approach will suit:
library(tidyverse)
df <- data.frame(A = c(NA,5,4,NA,1),
B = c(1,NA,1,1,NA),
C = c(3,3,NA,NA,6),
D = c(0,0,1,1,1))
df_output <- df %>%
group_by(D) %>%
mutate(
A = ifelse(is.na(A),
mean(A, na.rm=TRUE),A)
) %>%
mutate(
B = ifelse(is.na(B),
mean(B, na.rm=TRUE),B)
) %>%
mutate(
C = ifelse(is.na(C),
mean(C, na.rm=TRUE),C)
)
df_output
#> # A tibble: 5 × 4
#> # Groups: D [2]
#> A B C D
#> <dbl> <dbl> <dbl> <dbl>
#> 1 5 1 3 0
#> 2 5 1 3 0
#> 3 4 1 6 1
#> 4 2.5 1 6 1
#> 5 1 1 6 1
df_output_2 <- df %>%
group_by(D) %>%
mutate(across(A:C, ~replace_na(.x, mean(.x, na.rm = TRUE))))
df_output_2
#> # A tibble: 5 × 4
#> # Groups: D [2]
#> A B C D
#> <dbl> <dbl> <dbl> <dbl>
#> 1 5 1 3 0
#> 2 5 1 3 0
#> 3 4 1 6 1
#> 4 2.5 1 6 1
#> 5 1 1 6 1
all_equal(df_output, df_output_2)
#> [1] TRUE
Created on 2022-10-04 by the reprex package (v2.0.1)
CodePudding user response:
I encountered the same problem before but my dataset was bigger. In these cases, I use mutate_all
df %>% group_by(D) %>% mutate_all(funs(replace(., is.na(.), mean(., na.rm = TRUE))))
A B C D
<dbl> <dbl> <dbl> <dbl>
1 5 1 3 0
2 5 1 3 0
3 4 1 6 1
4 2.5 1 6 1
5 1 1 6 1