Home > Software engineering >  Winsorize to 1th and 99th (or 25th to 75th) percentile in a dataset in R
Winsorize to 1th and 99th (or 25th to 75th) percentile in a dataset in R

Time:10-20

I want to limit each of the AI_ columns to those that are above a certain threshold (in this case 0.0), and then return the standard deviation of those that fall within the 1st and 99th (or 25th to 75th) percentile

I used a function that limits a vector x to those that are above the threshold, then gets the 1st to 99th percentile, and return the st dev of the values that fall within this range.

f <- function(x, thresh=0) {
  x <- x[x>thresh]
  lims = quantile(x, probs=c(0.01,0.99))
  sd(x[between(x,lims[1],lims[2])])
}

and then:

AI %>% group_by(ID) %>% summarize(across(starts_with("AI"), f))

Then I got the following ERRORs

Error in summarize(., across(starts_with("AI"), f)) : i The error occurred in group 11: ID = "ID19". Caused by error in across(): ! Problem while computing column AI_2. Caused by error in quantile.default(): ! missing values and NaN's not allowed if 'na.rm' is FALSE

I also used the following code for to keep values between the 1st and 99 percentile but that only works for 1 column and does not include the treshold..

keep <- AI$AI_1 >= quantile(AI$AI_1, .01) & AI$AI_1 <= quantile(AI$AI_1, .99)
AI_2 <- AI[keep,]

This is a sample of the dataset (does not include NA values..):

structure(list(X = 1:100, x1.time = c("00:00:00", "00:01:00", 
"00:02:00", "00:03:00", "00:04:00", "00:05:00", "00:06:00", "00:07:00", 
"00:08:00", "00:09:00", "00:10:00", "00:11:00", "00:12:00", "00:13:00", 
"00:14:00", "00:15:00", "00:16:00", "00:17:00", "00:18:00", "00:19:00", 
"00:20:00", "00:21:00", "00:22:00", "00:23:00", "00:24:00", "00:25:00", 
"00:26:00", "00:27:00", "00:28:00", "00:29:00", "00:30:00", "00:31:00", 
"00:32:00", "00:33:00", "00:34:00", "00:35:00", "00:36:00", "00:37:00", 
"00:38:00", "00:39:00", "00:40:00", "00:41:00", "00:42:00", "00:43:00", 
"00:44:00", "00:45:00", "00:46:00", "00:47:00", "00:48:00", "00:49:00", 
"00:50:00", "00:51:00", "00:52:00", "00:53:00", "00:54:00", "00:55:00", 
"00:56:00", "00:57:00", "00:58:00", "00:59:00", "01:00:00", "01:01:00", 
"01:02:00", "01:03:00", "01:04:00", "01:05:00", "01:06:00", "01:07:00", 
"01:08:00", "01:09:00", "01:10:00", "01:11:00", "01:12:00", "01:13:00", 
"01:14:00", "01:15:00", "01:16:00", "01:17:00", "01:18:00", "01:19:00", 
"01:20:00", "01:21:00", "01:22:00", "01:23:00", "01:24:00", "01:25:00", 
"01:26:00", "01:27:00", "01:28:00", "01:29:00", "01:30:00", "01:31:00", 
"01:32:00", "01:33:00", "01:34:00", "01:35:00", "01:36:00", "01:37:00", 
"01:38:00", "01:39:00"), AI_1 = c(0.16528026437454, 0.163995204250076, 
0.164581401494747, 0.161694314834269, 0.163917689382925, 0.164007457424482, 
0.165191747536324, 0.16530877672998, 0.164301659628605, 0.16242849405884, 
0.163719381361245, 0.164336918744191, 0.163752563186272, 0.164085843486251, 
0.16378316688897, 0.166833331781526, 0.16429177819724, 0.163654350013046, 
0.162191321708722, 0.164125054995564, 0.163007832586539, 0.164642576607691, 
0.164240497997558, 0.164368040327747, 0.165115677448188, 0.166446115241087, 
0.164364226061044, 0.163980648381964, 0.164217299179925, 0.163697267390534, 
0.165023149913166, 0.164963028992835, 0.164581942410383, 0.164102888598112, 
0.164114359824609, 0.164909199734182, 0.1647723097995, 0.164333631722669, 
0.163217992625673, 0.162930307326649, 0.164019924780975, 0.162721307047624, 
0.164192622368622, 0.164757374919656, 0.164779535457602, 0.165821461484994, 
0.162647621622753, 0.165919323130543, 0.164117042017469, 0.162910189831296, 
0.164258105734382, 0.162312615318485, 0.16428182389575, 0.16213902859448, 
0.162884408334533, 0.162198437052509, 0.165414446056782, 0.162873605934549, 
0.162172656294551, 0.162135582764911, 0.163042752319553, 0.16224644975155, 
0.164009684185798, 0.163110019752022, 0.163379927078646, 0.164012257170658, 
0.164439089427949, 0.16224770327632, 0.164591945565879, 0.164431298827987, 
0.165736844531931, 0.162629472542491, 0.164787476839252, 0.162719814852985, 
0.164527472183818, 0.162039786720132, 0.1645545617745, 0.164705888656581, 
0.16242985788503, 0.16077563386833, 0.164190837236036, 0.164459238508408, 
0.162516082375934, 0.164605728678467, 0.163147995834175, 0.161349837954994, 
0.163897611611084, 0.162861088963398, 0.162403920420443, 0.163557577227182, 
0.164293649590329, 0.161513543124043, 0.160984587132739, 0.162398145567455, 
0.16196526716528, 0.164749564473586, 0.160500524232274, 0.161814085721974, 
0.161727028371301, 0.163082100005441), AI_2 = c(0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), AI_3 = c(0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), AI_4 = c(0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0), AI_5 = c(0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0), AI_6 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0), AI_7 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0), ID = c("ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1")), row.names = c(NA, 100L), class = "data.frame")

Does anyone know how to solve this problem?

CodePudding user response:

Where df is your data set.

library(tidyverse)
thresh <- 0
df %>% 
  pivot_longer(starts_with("AI_")) %>%
  arrange(name, X) %>% 
  filter(value > thresh) %>% 
  group_by(name) %>% 
  mutate(
    q1 = quantile(value, probs = 0.01),
    q99 = quantile(value, probs = 0.99)
  ) %>% 
  filter(value >= q1, value <= q99) %>%
  group_by(name, ID) %>% 
  summarise(sd = sd(value)) %>% 
  ungroup() %>% 
  pivot_wider(
    id_cols = "ID",
    names_from = "name",
    values_from = "sd"
  )
  • Related