Taking the airquality dataset from the MASS library as an example:
> head(airquality)
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
5 NA NA 14.3 56 5 5
6 28 NA 14.9 66 5 6
I want to create three columns:
Missing_Ozone
, Missing_Total
and Missing_Percent
, such that:
Missing_Ozone = 1
if there is a missing value in the Ozone column in the current row.
Missing_total
= total count of missing values in the current row
Missing_Percent
= percentage of missing values in a row.
So for example, in row 1:
Missing_Ozone = 0
, Missing_total = 0
, Missing_percent = 0
In row 5:
Missing_Ozone = 1
, Missing_total = 2
, Missing percent = 100*(2/6)
In row 6:
Missing_Ozone = 0
, Missing_total = 1
, Missing percent = 100*(1/6)
I tried two approaches, without any luck:
The first was to iterate over each row and use an if statement:
library(MASS)
df_test = airquality
df_test$Missing_Ozone <- 0
for(i in 1:nrow(df_test)){
if (is.na(df_test$Ozone)) {
df_test$Missing_Ozone <- 1
}
}
The second was to just use the if-statement inside that for-loop.
Neither work, and I just get:
> df_test
Ozone Solar.R Wind Temp Month Day Missing_Ozone
1 41 190 7.4 67 5 1 0
2 36 118 8.0 72 5 2 0
3 12 149 12.6 74 5 3 0
4 18 313 11.5 62 5 4 0
5 NA NA 14.3 56 5 5 0
Any help is appreciated.
Edit: Also, does this type of data manipulation have a certain name? I found it hard to search online for a guide that goes through this type of data manipulation.
CodePudding user response:
Tidyverse approach:
library(dplyr)
airquality <- datasets::airquality
cols <- ncol(airquality)
airquality <- airquality %>%
mutate(
Missing_Ozone = as.numeric(is.na(Ozone)),
Missing_Total = rowSums(is.na(.)),
Missing_Percent = Missing_Total/cols
)
> head(airquality)
Ozone Solar.R Wind Temp Month Day Missing_Ozone Missing_Total Missing_Percent
1 41 190 7.4 67 5 1 0 0 0.0000000
2 36 118 8.0 72 5 2 0 0 0.0000000
3 12 149 12.6 74 5 3 0 0 0.0000000
4 18 313 11.5 62 5 4 0 0 0.0000000
5 NA NA 14.3 56 5 5 1 2 0.3333333
6 28 NA 14.9 66 5 6 0 1 0.1666667
Base R approach:
cols <- ncol(airquality)
airquality$Missing_Ozone <- as.numeric(is.na(airquality$Ozone))
airquality$Missing_Total <- rowSums(is.na(airquality))
airquality$Missing_Percent <- airquality$Missing_Total/cols
> head(airquality)
> Ozone Solar.R Wind Temp Month Day Missing_Ozone Missing_Total Missing_Percent
1 41 190 7.4 67 5 1 0 0 0.0000000
2 36 118 8.0 72 5 2 0 0 0.0000000
3 12 149 12.6 74 5 3 0 0 0.0000000
4 18 313 11.5 62 5 4 0 0 0.0000000
5 NA NA 14.3 56 5 5 1 2 0.3333333
6 28 NA 14.9 66 5 6 0 1 0.1666667
edit: A note on performance
I would advise in general against usage of rowwise
operations outside of very specific use cases. It will slow you down heavily as your data set scales. The execution time tends to grow linearly with your data, which is really, really bad. A little benchmark with a data set size of 6,426
rows instead of 153
:
library(dplyr)
library(microbenchmark)
airquality <- datasets::airquality
# Rowwise
approachA <- function(data) {
result <- data %>%
mutate(Missing_Ozone = as.integer(is.na(Ozone))) %>%
rowwise() %>%
mutate(Missing_Total = sum(is.na((c_across(-Missing_Ozone))))) %>%
mutate(Missing_Percent = Missing_Total/ncol(airquality)) %>%
ungroup()
return(result)
}
# Tidy
approachB <- function(data) {
cols <- ncol(data)
result <- data %>%
mutate(
Missing_Ozone = as.numeric(is.na(Ozone)),
Missing_Total = rowSums(is.na(.)),
Missing_Percent = Missing_Total/cols
)
return(result)
}
# Base R
approachC <- function(data) {
cols <- ncol(data)
data$Missing_Ozone <- as.numeric(is.na(data$Ozone))
data$Missing_Total <- rowSums(is.na(data))
data$Missing_Percent <- data$Missing_Total/cols
return(data)
}
Result with data x 42: rowwise()
has led to some orders of magnitude worse performance over both proposed approaches.
> test_data <- do.call("rbind", replicate(42, airquality, simplify = FALSE))
> set.seed(42)
> microbenchmark::microbenchmark(approachA(test_data), approachB(test_data), approachC(test_data))
Unit: microseconds
expr min lq mean median uq max neval cld
approachA(test_data) 243340.904 251838.3590 259083.8089 256546.9015 260567.8945 405326.615 100 b
approachB(test_data) 577.977 624.0610 723.8304 741.0955 770.3695 2382.756 100 a
approachC(test_data) 102.377 107.9735 139.5595 119.6175 129.4165 2074.231 100 a
Result with data x 420: Execution time of rowwise
approach has grown 10x.
test_data <- do.call("rbind", replicate(420, airquality, simplify = FALSE))
> set.seed(42)
> microbenchmark::microbenchmark(approachA(test_data), approachB(test_data), approachC(test_data))
Unit: microseconds
expr min lq mean median uq max neval cld
approachA(test_data) 2519480.258 2620528.08 2671419.663 2672263.417 2707896.209 2907659.730 100 b
approachB(test_data) 1266.818 1488.71 1909.167 1576.327 1678.725 21011.147 100 a
approachC(test_data) 808.684 881.09 1220.151 1000.277 1067.907 8218.655 100 a
CodePudding user response:
A solution using the dplyr
package. rowwise
and c_cross
allow us to do calculation by each row.
library(dplyr)
dat <- airquality %>%
mutate(Missing_Ozone = as.integer(is.na(Ozone))) %>%
rowwise() %>%
mutate(Missing_Total = sum(is.na((c_across(-Missing_Ozone))))) %>%
mutate(Missing_Percent = Missing_Total/ncol(airquality)) %>%
ungroup()
dat
# # A tibble: 153 x 9
# Ozone Solar.R Wind Temp Month Day Missing_Ozone Missing_Total Missing_Percent
# <int> <int> <dbl> <int> <int> <int> <int> <int> <dbl>
# 1 41 190 7.4 67 5 1 0 0 0
# 2 36 118 8 72 5 2 0 0 0
# 3 12 149 12.6 74 5 3 0 0 0
# 4 18 313 11.5 62 5 4 0 0 0
# 5 NA NA 14.3 56 5 5 1 2 0.333
# 6 28 NA 14.9 66 5 6 0 1 0.167
# 7 23 299 8.6 65 5 7 0 0 0
# 8 19 99 13.8 59 5 8 0 0 0
# 9 8 19 20.1 61 5 9 0 0 0
# 10 NA 194 8.6 69 5 10 1 1 0.167
# # ... with 143 more rows