I want to know how to write a loop on my dataset on repeat filters:
My sample dataset1:
df= structure(list(system = c("1-Jan-16", "2-Jan-16", "3-Jan-16",
"4-Jan-16"), evi1500 = c(0.437, 0.408, 0.429, NA), evi21500 = c(0.3891771,
0.38915543, 0.389133761, 0.389112091), kndvi1500 = c(0.493, 0.471,
0.769, 0.223), ndvi1500 = c(0.261, 0.698, 0.645, 0.627), nirv1500 = c(0.444426458,
0.444472048, 0.444517639, 0.444563229), evi2500 = c(0.366, 0.33,
0.367, 0.608), evi22500 = c(0.74, 0.241, 0.424, 0.398), kndvi2500 = c(0.41,
0.384, 0.684, 0.173), ndvi2500 = c(0.474621566, 0.474655555,
0.474689544, 0.474723532), nirv2500 = c(0.362, 0.596, 0.145,
0.442)), row.names = c(NA, 4L), class = "data.frame")
code1
outliersevi1500=hampel_outlier(df$evi1500,k_mad_value = 3)
outliersevi1500
outliersevi21500=hampel_outlier(df$evi21500,k_mad_value = 3)
outliersevi21500
outlierskndvi1500=hampel_outlier(df$kndvi1500,k_mad_value = 3)
outlierskndvi1500
df$evi1500[df$evi1500 < 0.1992968 | df$evi1500 > 0.5907032 ] <- NA
df$evi21500[df$evi21500 < 0.2243160 | df$evi21500 > 0.5534532 ] <- NA
df$kndvi1500[df$kndvi1500 < 0.1596835 | df$kndvi1500 > 0.7749794 ] <- NA
Thanks in advance for your help.
CodePudding user response:
hampel_outlier
returns upper and lower limits for outlier detection. Values outside this interval should be removed by setting their value to NA
. Invervals are determined for each column of df
individually. This filtering should be only applied to columns having either 1500
or 2500
in thier name.
Then you can calculate your thresholds and do the outlier replacement like this:
library(tidyverse)
library(funModeling)
#> Loading required package: Hmisc
#> Loading required package: lattice
#> Loading required package: survival
#> Loading required package: Formula
#>
#> Attaching package: 'Hmisc'
#> The following objects are masked from 'package:dplyr':
#>
#> src, summarize
#> The following objects are masked from 'package:base':
#>
#> format.pval, units
#> funModeling v.1.9.4 :)
#> Examples and tutorials at livebook.datascienceheroes.com
#> / Now in Spanish: librovivodecienciadedatos.ai
df <- structure(list(system = c(
"1-Jan-16", "2-Jan-16", "3-Jan-16",
"4-Jan-16"
), evi1500 = c(0.437, 0.408, 0.429, NA), evi21500 = c(
0.3891771,
0.38915543, 0.389133761, 0.389112091
), kndvi1500 = c(
0.493, 0.471,
0.769, 0.223
), ndvi1500 = c(0.261, 0.698, 0.645, 0.627), nirv1500 = c(
0.444426458,
0.444472048, 0.444517639, 0.444563229
), evi2500 = c(
0.366, 0.33,
0.367, 0.608
), evi22500 = c(0.74, 0.241, 0.424, 0.398), kndvi2500 = c(
0.41,
0.384, 0.684, 0.173
), ndvi2500 = c(
0.474621566, 0.474655555,
0.474689544, 0.474723532
), nirv2500 = c(
0.362, 0.596, 0.145,
0.442
)), row.names = c(NA, 4L), class = "data.frame")
thresholds <-
df %>%
pivot_longer(-system) %>%
group_by(name) %>%
summarise(outlieres = hampel_outlier(value, k_mad_value = 3) %>% list()) %>%
deframe()
df %>%
mutate(across(matches("1500|2500"), ~ {
(
.x < thresholds[[cur_column()]][["bottom_threshold"]] |
.x > thresholds[[cur_column()]][["top_threshold"]]
) %>%
ifelse(NA, .x)
})) %>%
pivot_wider()
#> # A tibble: 4 x 11
#> system evi1500 evi21500 kndvi1500 ndvi1500 nirv1500 evi2500 evi22500 kndvi2500
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1-Jan… 0.437 0.389 0.493 NA 0.444 0.366 0.74 0.41
#> 2 2-Jan… 0.408 0.389 0.471 0.698 0.444 0.33 0.241 0.384
#> 3 3-Jan… 0.429 0.389 0.769 0.645 0.445 0.367 0.424 0.684
#> 4 4-Jan… NA 0.389 0.223 0.627 0.445 NA 0.398 0.173
#> # … with 2 more variables: ndvi2500 <dbl>, nirv2500 <dbl>
Created on 2021-12-09 by the reprex package (v2.0.1)