df <- data.frame(A1 = c(6, 8, NA, 1, 5),
A2 = c(NA, NA, 9, 3, 6),
A3 = c(9, NA, 1, NA, 4),
B1 = c(NA, NA, 9, 3, 6),
B2 = c(9, NA, 1, NA, 4),
B3 = c(NA, NA, 9, 3, 6)
)
I have a dataset with multiple questionnaires that each have multiple items. I would like to replace the missing data with the row mean of the observable values for each of the questionnaires (missing values in A items replaced by row mean of A1 to A3 and missing values in B items replaces by row mean of B1 to B3). What is the best way to do that?
CodePudding user response:
You may try
df <- data.frame(A1 = c(6, 8, NA, 1, 5),
A2 = c(NA, NA, 9, 3, 6),
A3 = c(9, NA, 1, NA, 4),
B1 = c(NA, NA, 9, 3, 6),
B2 = c(9, NA, 1, NA, 4),
B3 = c(NA, NA, 9, 3, 6)
)
df1 <- df %>%
select(starts_with("A"))
df2 <- df %>%
select(starts_with("B"))
x1 <- which(is.na(df1), arr.ind = TRUE)
df1[x1] <- rowMeans(df1, na.rm = T)[x1[,1]]
x2 <- which(is.na(df2), arr.ind = TRUE)
df2[x2] <- rowMeans(df2, na.rm = T)[x2[,1]]
df <- cbind(df1, df2)
df
A1 A2 A3 B1 B2 B3
1 6 7.5 9 9 9 9
2 8 8.0 8 NaN NaN NaN
3 5 9.0 1 9 1 9
4 1 3.0 2 3 3 3
5 5 6.0 4 6 4 6
CodePudding user response:
You may use split.default
to split data in different groups and replace NA
with row-wise mean (taken from this answer https://stackoverflow.com/a/6918323/3962914 )
as.data.frame(lapply(split.default(df, sub('\\d ', '', names(df))), function(x) {
k <- which(is.na(x), arr.ind = TRUE)
x[k] <- rowMeans(x, na.rm = TRUE)[k[, 1]]
x
})) -> result
names(result) <- names(df)
result
# A1 A2 A3 B1 B2 B3
#1 6 7.5 9 9 9 9
#2 8 8.0 8 NaN NaN NaN
#3 5 9.0 1 9 1 9
#4 1 3.0 2 3 3 3
#5 5 6.0 4 6 4 6
CodePudding user response:
You could also do:
library(dplyr)
df %>%
reshape(names(.), dir='long', sep="")%>%
group_by(id) %>%
mutate(across(A:B, ~replace(.x, is.na(.x), mean(.x, na.rm = TRUE))))%>%
pivot_wider(id, names_from = time, values_from = A:B, names_sep = "") %>%
ungroup() %>%
select(-id)
# A tibble: 5 x 6
A1 A2 A3 B1 B2 B3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 6 7.5 9 9 9 9
2 8 8 8 NaN NaN NaN
3 5 9 1 9 1 9
4 1 3 2 3 3 3
5 5 6 4 6 4 6
CodePudding user response:
We can use split.default
with na.aggregate
library(purrr)
library(zoo)
library(dplyr)
library(stringr)
map_dfc(split.default(df, str_remove(names(df), "\\d ")), ~
as_tibble(t(na.aggregate(t(.x)))))
# A tibble: 5 × 6
A1 A2 A3 B1 B2 B3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 6 7.5 9 9 9 9
2 8 8 8 NaN NaN NaN
3 5 9 1 9 1 9
4 1 3 2 3 3 3
5 5 6 4 6 4 6
CodePudding user response:
Span a matrix of rowMeans
on the row
s and replace
the NA's
. In an lapply
that grep
s the questions.
do.call(cbind, lapply(c('A', 'B'), function(q) {
s <- df[, grep(q, names(df))]
na <- is.na(s)
replace(s, na, rowMeans(s, na.rm=TRUE)[row(s)][na])
}))
# A1 A2 A3 B1 B2 B3
# 1 6 7.5 9 9 9 9
# 2 8 8.0 8 NaN NaN NaN
# 3 5 9.0 1 9 1 9
# 4 1 3.0 2 3 3 3
# 5 5 6.0 4 6 4 6
Data:
df <- structure(list(A1 = c(6, 8, NA, 1, 5), A2 = c(NA, NA, 9, 3, 6
), A3 = c(9, NA, 1, NA, 4), B1 = c(NA, NA, 9, 3, 6), B2 = c(9,
NA, 1, NA, 4), B3 = c(NA, NA, 9, 3, 6)), class = "data.frame", row.names = c(NA,
-5L))