This datasets contains 42 IDs with 1440 observations per ID. This means that the dataset got 60480 observations in total. The values of the columns AI_1..., to AI_7 got values that range from 0.00 to approx 3.. (depending on ID). I try to make certain tresholds for each column (AI_1 to AI_7) that gives the value 1 if AI_n < 0 and 0 if AI_n > 0. This values should be entered in a new column called activity_1, ..., activity_7. I did this with the following code which worked:
suppressPackageStartupMessages({
library(dplyr)
library(stringr)
})
Threshold <- AI %>%
select(-starts_with("activity")) %>%
mutate(across(starts_with("AI_"), ~ as.integer(.x <= 0.0), .names = "activity_{col}")) %>%
rename_at(vars(starts_with("activity_AI")), ~ str_remove(., "_AI_"))
What I want is to create the Standard deviation and Variation of the AI_1, ..., AI_7 grouped by ID only for those threshold values that exceed the 0. I.e., where the activity_1, ..., activity_7 was 0. Because the 0 values were the point that were bigger than the value 0. I tried it with the following code but it also takes the 0.0 values into account which I want to exclude:
AI_SD <- Threshold %>% group_by(ID) %>% summarise(across(everything(), sd))
This code makes a new dataframe where the 1440 point are converted to 1 point per ID so now we have 42 observation instead of 60480.
This is the data without the activity thresholds
structure(list(X = 1:20, x1.time = c("00:00:00", "00:01:00",
"00:02:00", "00:03:00", "00:04:00", "00:05:00", "00:06:00", "00:07:00",
"00:08:00", "00:09:00", "00:10:00", "00:11:00", "00:12:00", "00:13:00",
"00:14:00", "00:15:00", "00:16:00", "00:17:00", "00:18:00", "00:19:00"
), AI_1 = c(0.17532896077581, 0.174249939439765, 0.174170544792533,
0.172877357886967, 0.173679017353614, 0.174216799443538, 0.174514454250882,
0.174656389074666, 0.173377175454716, 0.173044040397703, 0.172476572884875,
0.174738790856458, 0.173833445732856, 0.174229265722835, 0.174392878820111,
0.174715890976243, 0.174241614289181, 0.173229751013599, 0.173579164085914,
0.173829069216696), AI_2 = c(0.173549588758752, 0, 0.85729795236214,
0.513925586220723, 0.140789239632585, 0.0989981552300843, 0.321625480480368,
0.62540390366724, 0.00714855410741877, 0, 0, 0, 0.212943798631015,
0, 0, 0.023650258664654, 0.00159158576982517, 0.0172670511608436,
0, 0), AI_3 = c(0.026069149474549, 0.0417747330978121, 0.276687600798659,
0.258591321128928, 0.208790296683244, 0.0300099278967508, 0.15234594700642,
0.26519848659315, 0.34220566727692, 0.352310255219813, 0.297621781376737,
0.292800000618149, 0.481566536382664, 0.337770306519177, 0.743182296874282,
0.256202127993172, 0.201340506649845, 0.200155318345632, 0.237126429055375,
0.234974163009848), AI_4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), AI_5 = c(0, 0, 0.0015062890214412,
0.00154798776365785, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), AI_6 = c(0.190018331633492, 0.241159552783285, 0.231916111803065,
0.193196835220518, 0.240381778378367, 0.266125762332231, 0.339227319507121,
0.354841547583334, 0.277011867279295, 0.474462632995715, 0.516356521276347,
0.559477604383845, 0.374857636694405, 0.376675155204282, 0.516347133869462,
0.627633542885353, 0.565732682034457, 0.544148310829377, 0.545022418887296,
0.602327138107482), AI_7 = c(0.139608768263461, 0.165583663096789,
0.326959508587122, 0.221739297198209, 0.160657663051105, 0.107439748199699,
0.117594125364214, 0.133528520361788, 0.117950354159875, 0.131428192187155,
0.125355403562937, 0.119185646272255, 0.196285453922129, 0.167061057207379,
0.169855099745761, 0.141077126343563, 0.078433720675593, 0.0999303057993443,
0.0798045801131668, 0.0331137028671696), ID = c("ID1", "ID1",
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1",
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1"
)), row.names = c(NA, 20L), class = "data.frame")
This is the data with the activity thresholds
structure(list(X = 1:20, x1.time = c("00:00:00", "00:01:00",
"00:02:00", "00:03:00", "00:04:00", "00:05:00", "00:06:00", "00:07:00",
"00:08:00", "00:09:00", "00:10:00", "00:11:00", "00:12:00", "00:13:00",
"00:14:00", "00:15:00", "00:16:00", "00:17:00", "00:18:00", "00:19:00"
), AI_1 = c(0.17532896077581, 0.174249939439765, 0.174170544792533,
0.172877357886967, 0.173679017353614, 0.174216799443538, 0.174514454250882,
0.174656389074666, 0.173377175454716, 0.173044040397703, 0.172476572884875,
0.174738790856458, 0.173833445732856, 0.174229265722835, 0.174392878820111,
0.174715890976243, 0.174241614289181, 0.173229751013599, 0.173579164085914,
0.173829069216696), AI_2 = c(0.173549588758752, 0, 0.85729795236214,
0.513925586220723, 0.140789239632585, 0.0989981552300843, 0.321625480480368,
0.62540390366724, 0.00714855410741877, 0, 0, 0, 0.212943798631015,
0, 0, 0.023650258664654, 0.00159158576982517, 0.0172670511608436,
0, 0), AI_3 = c(0.026069149474549, 0.0417747330978121, 0.276687600798659,
0.258591321128928, 0.208790296683244, 0.0300099278967508, 0.15234594700642,
0.26519848659315, 0.34220566727692, 0.352310255219813, 0.297621781376737,
0.292800000618149, 0.481566536382664, 0.337770306519177, 0.743182296874282,
0.256202127993172, 0.201340506649845, 0.200155318345632, 0.237126429055375,
0.234974163009848), AI_4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), AI_5 = c(0, 0, 0.0015062890214412,
0.00154798776365785, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), AI_6 = c(0.190018331633492, 0.241159552783285, 0.231916111803065,
0.193196835220518, 0.240381778378367, 0.266125762332231, 0.339227319507121,
0.354841547583334, 0.277011867279295, 0.474462632995715, 0.516356521276347,
0.559477604383845, 0.374857636694405, 0.376675155204282, 0.516347133869462,
0.627633542885353, 0.565732682034457, 0.544148310829377, 0.545022418887296,
0.602327138107482), AI_7 = c(0.139608768263461, 0.165583663096789,
0.326959508587122, 0.221739297198209, 0.160657663051105, 0.107439748199699,
0.117594125364214, 0.133528520361788, 0.117950354159875, 0.131428192187155,
0.125355403562937, 0.119185646272255, 0.196285453922129, 0.167061057207379,
0.169855099745761, 0.141077126343563, 0.078433720675593, 0.0999303057993443,
0.0798045801131668, 0.0331137028671696), ID = c("ID1", "ID1",
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1",
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1"
), activity1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), activity2 = c(0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L,
1L), activity3 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), activity4 = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), activity5 = c(1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), activity6 = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), activity7 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(NA, 20L), class = "data.frame")
**The question is: how can I find the variance or standard deviation grouped by ID only for those values that have the value 0 (i.e., exceeds the threshold of 0)**
CodePudding user response:
You can limit the call to sd
to only those elements of .x
where .x
exceeds 0.
data %>%
group_by(ID) %>%
summarize(across(starts_with("AI"),~sd(.x[.x>0])))
Output:
ID AI_1 AI_2 AI_3 AI_4 AI_5 AI_6 AI_7
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ID1 0.000715 0.278 0.160 NA 0.0000295 0.150 0.0610