Home > Software engineering >  Find the variance based on a threshold in R
Find the variance based on a threshold in R

Time:10-13

This datasets contains 42 IDs with 1440 observations per ID. This means that the dataset got 60480 observations in total. The values of the columns AI_1..., to AI_7 got values that range from 0.00 to approx 3.. (depending on ID). I try to make certain tresholds for each column (AI_1 to AI_7) that gives the value 1 if AI_n < 0 and 0 if AI_n > 0. This values should be entered in a new column called activity_1, ..., activity_7. I did this with the following code which worked:

suppressPackageStartupMessages({
  library(dplyr)
  library(stringr)
})

Threshold <- AI %>%
  select(-starts_with("activity")) %>%
  mutate(across(starts_with("AI_"), ~ as.integer(.x <= 0.0), .names = "activity_{col}")) %>%
  rename_at(vars(starts_with("activity_AI")), ~ str_remove(., "_AI_"))

What I want is to create the Standard deviation and Variation of the AI_1, ..., AI_7 grouped by ID only for those threshold values that exceed the 0. I.e., where the activity_1, ..., activity_7 was 0. Because the 0 values were the point that were bigger than the value 0. I tried it with the following code but it also takes the 0.0 values into account which I want to exclude:

AI_SD <- Threshold %>% group_by(ID) %>% summarise(across(everything(), sd))

This code makes a new dataframe where the 1440 point are converted to 1 point per ID so now we have 42 observation instead of 60480.

This is the data without the activity thresholds

structure(list(X = 1:20, x1.time = c("00:00:00", "00:01:00", 
"00:02:00", "00:03:00", "00:04:00", "00:05:00", "00:06:00", "00:07:00", 
"00:08:00", "00:09:00", "00:10:00", "00:11:00", "00:12:00", "00:13:00", 
"00:14:00", "00:15:00", "00:16:00", "00:17:00", "00:18:00", "00:19:00"
), AI_1 = c(0.17532896077581, 0.174249939439765, 0.174170544792533, 
0.172877357886967, 0.173679017353614, 0.174216799443538, 0.174514454250882, 
0.174656389074666, 0.173377175454716, 0.173044040397703, 0.172476572884875, 
0.174738790856458, 0.173833445732856, 0.174229265722835, 0.174392878820111, 
0.174715890976243, 0.174241614289181, 0.173229751013599, 0.173579164085914, 
0.173829069216696), AI_2 = c(0.173549588758752, 0, 0.85729795236214, 
0.513925586220723, 0.140789239632585, 0.0989981552300843, 0.321625480480368, 
0.62540390366724, 0.00714855410741877, 0, 0, 0, 0.212943798631015, 
0, 0, 0.023650258664654, 0.00159158576982517, 0.0172670511608436, 
0, 0), AI_3 = c(0.026069149474549, 0.0417747330978121, 0.276687600798659, 
0.258591321128928, 0.208790296683244, 0.0300099278967508, 0.15234594700642, 
0.26519848659315, 0.34220566727692, 0.352310255219813, 0.297621781376737, 
0.292800000618149, 0.481566536382664, 0.337770306519177, 0.743182296874282, 
0.256202127993172, 0.201340506649845, 0.200155318345632, 0.237126429055375, 
0.234974163009848), AI_4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0), AI_5 = c(0, 0, 0.0015062890214412, 
0.00154798776365785, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0), AI_6 = c(0.190018331633492, 0.241159552783285, 0.231916111803065, 
0.193196835220518, 0.240381778378367, 0.266125762332231, 0.339227319507121, 
0.354841547583334, 0.277011867279295, 0.474462632995715, 0.516356521276347, 
0.559477604383845, 0.374857636694405, 0.376675155204282, 0.516347133869462, 
0.627633542885353, 0.565732682034457, 0.544148310829377, 0.545022418887296, 
0.602327138107482), AI_7 = c(0.139608768263461, 0.165583663096789, 
0.326959508587122, 0.221739297198209, 0.160657663051105, 0.107439748199699, 
0.117594125364214, 0.133528520361788, 0.117950354159875, 0.131428192187155, 
0.125355403562937, 0.119185646272255, 0.196285453922129, 0.167061057207379, 
0.169855099745761, 0.141077126343563, 0.078433720675593, 0.0999303057993443, 
0.0798045801131668, 0.0331137028671696), ID = c("ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1"
)), row.names = c(NA, 20L), class = "data.frame")

This is the data with the activity thresholds

structure(list(X = 1:20, x1.time = c("00:00:00", "00:01:00", 
"00:02:00", "00:03:00", "00:04:00", "00:05:00", "00:06:00", "00:07:00", 
"00:08:00", "00:09:00", "00:10:00", "00:11:00", "00:12:00", "00:13:00", 
"00:14:00", "00:15:00", "00:16:00", "00:17:00", "00:18:00", "00:19:00"
), AI_1 = c(0.17532896077581, 0.174249939439765, 0.174170544792533, 
0.172877357886967, 0.173679017353614, 0.174216799443538, 0.174514454250882, 
0.174656389074666, 0.173377175454716, 0.173044040397703, 0.172476572884875, 
0.174738790856458, 0.173833445732856, 0.174229265722835, 0.174392878820111, 
0.174715890976243, 0.174241614289181, 0.173229751013599, 0.173579164085914, 
0.173829069216696), AI_2 = c(0.173549588758752, 0, 0.85729795236214, 
0.513925586220723, 0.140789239632585, 0.0989981552300843, 0.321625480480368, 
0.62540390366724, 0.00714855410741877, 0, 0, 0, 0.212943798631015, 
0, 0, 0.023650258664654, 0.00159158576982517, 0.0172670511608436, 
0, 0), AI_3 = c(0.026069149474549, 0.0417747330978121, 0.276687600798659, 
0.258591321128928, 0.208790296683244, 0.0300099278967508, 0.15234594700642, 
0.26519848659315, 0.34220566727692, 0.352310255219813, 0.297621781376737, 
0.292800000618149, 0.481566536382664, 0.337770306519177, 0.743182296874282, 
0.256202127993172, 0.201340506649845, 0.200155318345632, 0.237126429055375, 
0.234974163009848), AI_4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0), AI_5 = c(0, 0, 0.0015062890214412, 
0.00154798776365785, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0), AI_6 = c(0.190018331633492, 0.241159552783285, 0.231916111803065, 
0.193196835220518, 0.240381778378367, 0.266125762332231, 0.339227319507121, 
0.354841547583334, 0.277011867279295, 0.474462632995715, 0.516356521276347, 
0.559477604383845, 0.374857636694405, 0.376675155204282, 0.516347133869462, 
0.627633542885353, 0.565732682034457, 0.544148310829377, 0.545022418887296, 
0.602327138107482), AI_7 = c(0.139608768263461, 0.165583663096789, 
0.326959508587122, 0.221739297198209, 0.160657663051105, 0.107439748199699, 
0.117594125364214, 0.133528520361788, 0.117950354159875, 0.131428192187155, 
0.125355403562937, 0.119185646272255, 0.196285453922129, 0.167061057207379, 
0.169855099745761, 0.141077126343563, 0.078433720675593, 0.0999303057993443, 
0.0798045801131668, 0.0331137028671696), ID = c("ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1"
), activity1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), activity2 = c(0L, 1L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 
1L), activity3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), activity4 = c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L), activity5 = c(1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), activity6 = c(0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L), activity7 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(NA, 20L), class = "data.frame")

**The question is: how can I find the variance or standard deviation grouped by ID only for those values that have the value 0 (i.e., exceeds the threshold of 0)**

CodePudding user response:

You can limit the call to sd to only those elements of .x where .x exceeds 0.

data %>% 
  group_by(ID) %>% 
  summarize(across(starts_with("AI"),~sd(.x[.x>0])))

Output:

  ID        AI_1  AI_2  AI_3  AI_4      AI_5  AI_6   AI_7
  <chr>    <dbl> <dbl> <dbl> <dbl>     <dbl> <dbl>  <dbl>
1 ID1   0.000715 0.278 0.160    NA 0.0000295 0.150 0.0610
  • Related