How to replace missing data of questionnaire items with row means in R?-CodePudding

df <- data.frame(A1 = c(6, 8, NA, 1, 5),
                A2 = c(NA, NA, 9, 3, 6),
                A3 = c(9, NA, 1, NA, 4),
                B1 = c(NA, NA, 9, 3, 6),
                B2 = c(9, NA, 1, NA, 4),
                B3 = c(NA, NA, 9, 3, 6)
                )

I have a dataset with multiple questionnaires that each have multiple items. I would like to replace the missing data with the row mean of the observable values for each of the questionnaires (missing values in A items replaced by row mean of A1 to A3 and missing values in B items replaces by row mean of B1 to B3). What is the best way to do that?

CodePudding user response：

You may try

df <- data.frame(A1 = c(6, 8, NA, 1, 5),
                    A2 = c(NA, NA, 9, 3, 6),
                    A3 = c(9, NA, 1, NA, 4),
                    B1 = c(NA, NA, 9, 3, 6),
                    B2 = c(9, NA, 1, NA, 4),
                    B3 = c(NA, NA, 9, 3, 6)
)

df1 <- df %>%
  select(starts_with("A"))
df2 <- df %>%
  select(starts_with("B"))

x1 <- which(is.na(df1), arr.ind = TRUE)
df1[x1] <- rowMeans(df1, na.rm = T)[x1[,1]]

x2 <- which(is.na(df2), arr.ind = TRUE)
df2[x2] <- rowMeans(df2, na.rm = T)[x2[,1]]
df <- cbind(df1, df2)
df

  A1  A2 A3  B1  B2  B3
1  6 7.5  9   9   9   9
2  8 8.0  8 NaN NaN NaN
3  5 9.0  1   9   1   9
4  1 3.0  2   3   3   3
5  5 6.0  4   6   4   6

CodePudding user response：

You may use split.default to split data in different groups and replace NA with row-wise mean (taken from this answer https://stackoverflow.com/a/6918323/3962914 )

as.data.frame(lapply(split.default(df, sub('\\d ', '', names(df))), function(x) {
  k <- which(is.na(x), arr.ind = TRUE)
  x[k] <- rowMeans(x, na.rm = TRUE)[k[, 1]]
  x
})) -> result
names(result) <- names(df)
result

#  A1  A2 A3  B1  B2  B3
#1  6 7.5  9   9   9   9
#2  8 8.0  8 NaN NaN NaN
#3  5 9.0  1   9   1   9
#4  1 3.0  2   3   3   3
#5  5 6.0  4   6   4   6

CodePudding user response：

You could also do:

library(dplyr)
df %>%
  reshape(names(.), dir='long', sep="")%>%
  group_by(id) %>%
  mutate(across(A:B, ~replace(.x, is.na(.x), mean(.x, na.rm = TRUE))))%>%
  pivot_wider(id, names_from = time, values_from = A:B, names_sep = "") %>%
  ungroup() %>%
  select(-id)


    # A tibble: 5 x 6
     A1    A2    A3    B1    B2    B3
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1     6   7.5     9     9     9     9
2     8   8       8   NaN   NaN   NaN
3     5   9       1     9     1     9
4     1   3       2     3     3     3
5     5   6       4     6     4     6

CodePudding user response：

We can use split.default with na.aggregate

library(purrr)
library(zoo)
library(dplyr)
library(stringr)
 map_dfc(split.default(df, str_remove(names(df), "\\d ")), ~ 
         as_tibble(t(na.aggregate(t(.x)))))
# A tibble: 5 × 6
     A1    A2    A3    B1    B2    B3
  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1     6   7.5     9     9     9     9
2     8   8       8   NaN   NaN   NaN
3     5   9       1     9     1     9
4     1   3       2     3     3     3
5     5   6       4     6     4     6

CodePudding user response：

Span a matrix of rowMeans on the rows and replace the NA's. In an lapply that greps the questions.

do.call(cbind, lapply(c('A', 'B'), function(q) {
  s <- df[, grep(q, names(df))]
  na <- is.na(s)
  replace(s, na, rowMeans(s, na.rm=TRUE)[row(s)][na])
}))
#   A1  A2 A3  B1  B2  B3
# 1  6 7.5  9   9   9   9
# 2  8 8.0  8 NaN NaN NaN
# 3  5 9.0  1   9   1   9
# 4  1 3.0  2   3   3   3
# 5  5 6.0  4   6   4   6

Data:

df <- structure(list(A1 = c(6, 8, NA, 1, 5), A2 = c(NA, NA, 9, 3, 6
), A3 = c(9, NA, 1, NA, 4), B1 = c(NA, NA, 9, 3, 6), B2 = c(9, 
NA, 1, NA, 4), B3 = c(NA, NA, 9, 3, 6)), class = "data.frame", row.names = c(NA, 
-5L))