Home > Enterprise >  Create a binary variable based on a threshold in R
Create a binary variable based on a threshold in R

Time:10-10

The following dataset contains 7 columns (AI_1 through AI_7), each with 1440 observations per ID (42 IDs in total). I want to create a binary variable for each AI column based on a threshold. For example, if AI_1 > 0.1, a new variable called ACTIVITY should get the value 1; otherwise it should get the value 0. I tried this with the following code, but when I compute the mean of the binary variable, the result is above 1, which is odd since the variable can only take the value 0 or 1. Does anyone know how to create these 7 binary variables in the same dataset so that each mean lies between 0 and 1?

structure(list(X = 1:30, x1.time = c("00:00:00", "00:01:00", 
"00:02:00", "00:03:00", "00:04:00", "00:05:00", "00:06:00", "00:07:00", 
"00:08:00", "00:09:00", "00:10:00", "00:11:00", "00:12:00", "00:13:00", 
"00:14:00", "00:15:00", "00:16:00", "00:17:00", "00:18:00", "00:19:00", 
"00:20:00", "00:21:00", "00:22:00", "00:23:00", "00:24:00", "00:25:00", 
"00:26:00", "00:27:00", "00:28:00", "00:29:00"), AI_1 = c(0.17532896077581, 
0.174249939439765, 0.174170544792533, 0.172877357886967, 0.173679017353614, 
0.174216799443538, 0.174514454250882, 0.174656389074666, 0.173377175454716, 
0.173044040397703, 0.172476572884875, 0.174738790856458, 0.173833445732856, 
0.174229265722835, 0.174392878820111, 0.174715890976243, 0.174241614289181, 
0.173229751013599, 0.173579164085914, 0.173829069216696, 0.173499039975341, 
0.174387946222767, 0.173802854581089, 0.174107580137568, 0.174113709936873, 
0.173172609295233, 0.174509255493075, 0.173383120975257, 0.173398927511582, 
0.173466516952908), AI_2 = c(0.173549588758752, 0, 0.85729795236214, 
0.513925586220723, 0.140789239632585, 0.0989981552300843, 0.321625480480368, 
0.62540390366724, 0.00714855410741877, 0, 0, 0, 0.212943798631015, 
0, 0, 0.023650258664654, 0.00159158576982517, 0.0172670511608436, 
0, 0, 0, 0.25653572767355, 0.41158598021939, 0.433889173147664, 
0.442200975044019, 0.471931171507954, 0.415009919603445, 0.43364443321512, 
0.449930874231746, 0.48397633182816), AI_3 = c(0.026069149474549, 
0.0417747330978121, 0.276687600798659, 0.258591321128928, 0.208790296683244, 
0.0300099278967508, 0.15234594700642, 0.26519848659315, 0.34220566727692, 
0.352310255219813, 0.297621781376737, 0.292800000618149, 0.481566536382664, 
0.337770306519177, 0.743182296874282, 0.256202127993172, 0.201340506649845, 
0.200155318345632, 0.237126429055375, 0.234974163009848, 0.235808994849961, 
0.302168675921402, 0.377936665388589, 0.416123299239618, 0.389279883023212, 
0.357972848973051, 0.305268847437493, 0.290040891577408, 0.197384083463156, 
0.258282654013295), AI_4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.00841646877382803, 
0), AI_5 = c(0, 0, 0.0015062890214412, 0.00154798776365785, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0), AI_6 = c(0.190018331633492, 0.241159552783285, 0.231916111803065, 
0.193196835220518, 0.240381778378367, 0.266125762332231, 0.339227319507121, 
0.354841547583334, 0.277011867279295, 0.474462632995715, 0.516356521276347, 
0.559477604383845, 0.374857636694405, 0.376675155204282, 0.516347133869462, 
0.627633542885353, 0.565732682034457, 0.544148310829377, 0.545022418887296, 
0.602327138107482, 0.529578366594453, 0.571672817412653, 0.51963881197827, 
0.493590581088222, 0.487545798153711, 0.525272191616523, 0.586906227102549, 
0.555446579214151, 0.578788883825157, 0.617822898150646), AI_7 = c(0.139608768263461, 
0.165583663096789, 0.326959508587122, 0.221739297198209, 0.160657663051105, 
0.107439748199699, 0.117594125364214, 0.133528520361788, 0.117950354159875, 
0.131428192187155, 0.125355403562937, 0.119185646272255, 0.196285453922129, 
0.167061057207379, 0.169855099745761, 0.141077126343563, 0.078433720675593, 
0.0999303057993443, 0.0798045801131668, 0.0331137028671696, 0.0920945831761988, 
0.0233052285173748, 0, 0, 0, 0.00876293044107867, 0, 0.109134564970416, 
0.110323312017635, 0.117772975747077), ID = c("ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1"
), activity = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0), activity2 = c("0", 
"1", "0", "0", "0", "1", "0", "0", "1", "1", "1", "1", "0", "1", 
"1", "1", "1", "1", "1", "1", "1", "0", "0", "0", "0", "0", "0", 
"0", "0", "0"), activity3 = c("1", "1", "0", "0", "0", "1", "0", 
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", 
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0"), activity4 = c("1", 
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", 
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", 
"1", "1", "1"), activity5 = c("1", "1", "1", "1", "1", "1", "1", 
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", 
"1", "1", "1", "1", "1", "1", "1", "1", "1", "1"), activity6 = c("0", 
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", 
"0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", 
"0", "0", "0"), activity7 = c("0", "0", "0", "0", "0", "0", "0", 
"0", "0", "0", "0", "0", "0", "0", "0", "0", "1", "1", "1", "1", 
"1", "1", "1", "1", "1", "1", "1", "0", "0", "0")), row.names = c(NA, 
30L), class = "data.frame")

This is the code I used:

    # Question's attempt: add one threshold flag per AI column, one mutate() per
    # step, each step building on the previous result (Threshold -> Threshold7).
    # Every flag is "1" when the AI value is <= 0.1 and "0" when it is > 0.1.
    # NOTE(review): the results are quoted, so each activity* column created here
    # is character rather than numeric - presumably why the mean misbehaves;
    # confirm.
    # NOTE(review): the question text asks for 1 when AI > 0.1, which is the
    # opposite of the comparison coded here - confirm the intended direction.

    # Step 1: flag for AI_1, stored in `activity`.
    Threshold <- Activity_index_1 %>%
                  mutate(activity = case_when(
                          AI_1 <= 0.1 ~ "1",
                          AI_1 > 0.1 ~ "0",
                          
                  ))
# Step 2: flag for AI_2, stored in `activity2`.
Threshold2 <-  Threshold %>%
                  mutate(activity2 = case_when(
                          AI_2 <= 0.1 ~ "1",
                          AI_2 > 0.1 ~ "0",
                          
                  ))
# Step 3: flag for AI_3, stored in `activity3`.
Threshold3 <- Threshold2 %>%
                  mutate(activity3 = case_when(
                          AI_3 <= 0.1 ~ "1",
                          AI_3 > 0.1 ~ "0",
                          
                  ))
# Step 4: flag for AI_4, stored in `activity4`.
Threshold4 <- Threshold3 %>%
                  mutate(activity4 = case_when(
                          AI_4 <= 0.1 ~ "1",
                          AI_4 > 0.1 ~ "0",
                          
                  ))
# Step 5: flag for AI_5, stored in `activity5`.
Threshold5 <- Threshold4 %>%
                  mutate(activity5 = case_when(
                          AI_5 <= 0.1 ~ "1",
                          AI_5 > 0.1 ~ "0",
                          
                  ))
# Step 6: flag for AI_6, stored in `activity6`.
Threshold6 <- Threshold5 %>%
                  mutate(activity6 = case_when(
                          AI_6 <= 0.1 ~ "1",
                          AI_6 > 0.1 ~ "0",
                          
                  ))
# Step 7: flag for AI_7, stored in `activity7`; Threshold7 holds all seven flags.
Threshold7 <- Threshold6 %>%
                  mutate(activity7 = case_when(
                          AI_7 <= 0.1 ~ "1",
                          AI_7 > 0.1 ~ "0",
                          
                  ))

CodePudding user response:

Here is a solution with mutate/across and a logical condition returning FALSE/TRUE then coerced to integers 0/1.

The posted data already contains `activity` columns, so I start by removing them from the data.

suppressPackageStartupMessages({
  library(dplyr)
  library(stringr)
})

# Build one integer 0/1 indicator per AI_* column.
#   1. Drop the activity* columns already present in the posted data so the
#      new indicators do not collide with them.
#   2. For every column whose name starts with "AI_", compare the values with
#      the 0.1 threshold and coerce the logical result to integer
#      (TRUE -> 1L, FALSE -> 0L). The new columns are named activity_AI_1, ...
#   3. Strip the "_AI_" infix so the columns end up as activity1 ... activity7.
#      rename_with() is used instead of the superseded rename_at()/vars() pair.
Threshold <- Activity_index_1 %>%
  select(-starts_with("activity")) %>%
  mutate(
    across(
      starts_with("AI_"),
      ~ as.integer(.x <= 0.1),
      .names = "activity_{.col}"
    )
  ) %>%
  rename_with(~ str_remove(.x, "_AI_"), .cols = starts_with("activity_AI"))

str(Threshold)
#> 'data.frame':    30 obs. of  17 variables:
#>  $ X        : int  1 2 3 4 5 6 7 8 9 10 ...
#>  $ x1.time  : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
#>  $ AI_1     : num  0.175 0.174 0.174 0.173 0.174 ...
#>  $ AI_2     : num  0.174 0 0.857 0.514 0.141 ...
#>  $ AI_3     : num  0.0261 0.0418 0.2767 0.2586 0.2088 ...
#>  $ AI_4     : num  0 0 0 0 0 0 0 0 0 0 ...
#>  $ AI_5     : num  0 0 0.00151 0.00155 0 ...
#>  $ AI_6     : num  0.19 0.241 0.232 0.193 0.24 ...
#>  $ AI_7     : num  0.14 0.166 0.327 0.222 0.161 ...
#>  $ ID       : chr  "ID1" "ID1" "ID1" "ID1" ...
#>  $ activity1: int  0 0 0 0 0 0 0 0 0 0 ...
#>  $ activity2: int  0 1 0 0 0 1 0 0 1 1 ...
#>  $ activity3: int  1 1 0 0 0 1 0 0 0 0 ...
#>  $ activity4: int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ activity5: int  1 1 1 1 1 1 1 1 1 1 ...
#>  $ activity6: int  0 0 0 0 0 0 0 0 0 0 ...
#>  $ activity7: int  0 0 0 0 0 0 0 0 0 0 ...

Created on 2022-10-10 with reprex v2.0.2

CodePudding user response:

Compare just the AI variables with 0.1, convert the result to numeric, set the column names, and cbind it to the original data.

# Append one numeric 0/1 indicator column per AI_* column of `dat`.
# The work happens inside local() so that only `res` is created in the
# calling environment.
res <- local({
  # Names of the columns to threshold (those starting with "AI").
  ai_cols <- grep("^AI", names(dat), value = TRUE)

  # Comparing a data frame with a scalar yields a logical matrix;
  # multiplying by 1 turns TRUE/FALSE into numeric 1/0.
  flags <- (dat[ai_cols] <= 0.1) * 1

  # Anchor the pattern so only the leading "AI" prefix is replaced, matching
  # the anchored selection above (AI_1 -> activity_1, ...).
  colnames(flags) <- sub("^AI", "activity", colnames(flags))

  cbind(dat, flags)
})

str(res)
# 'data.frame': 30 obs. of  16 variables:
# $ x1.time   : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
# $ AI_1      : num  0.175 0.174 0.174 0.173 0.174 ...
# $ AI_2      : num  0.174 0 0.857 0.514 0.141 ...
# $ AI_3      : num  0.0261 0.0418 0.2767 0.2586 0.2088 ...
# $ AI_4      : num  0 0 0 0 0 0 0 0 0 0 ...
# $ AI_5      : num  0 0 0.00151 0.00155 0 ...
# $ AI_6      : num  0.19 0.241 0.232 0.193 0.24 ...
# $ AI_7      : num  0.14 0.166 0.327 0.222 0.161 ...
# $ ID        : chr  "ID1" "ID1" "ID1" "ID1" ...
# $ activity_1: num  0 0 0 0 0 0 0 0 0 0 ...
# $ activity_2: num  0 1 0 0 0 1 0 0 1 1 ...
# $ activity_3: num  1 1 0 0 0 1 0 0 0 0 ...
# $ activity_4: num  1 1 1 1 1 1 1 1 1 1 ...
# $ activity_5: num  1 1 1 1 1 1 1 1 1 1 ...
# $ activity_6: num  0 0 0 0 0 0 0 0 0 0 ...
# $ activity_7: num  0 0 0 0 0 0 0 0 0 0 ...

# Sample data read by the `res` computation above: a 30-row data.frame with a
# character time column (x1.time), seven numeric columns AI_1 ... AI_7 and a
# character ID column (every row is "ID1").
# NOTE(review): this looks like the question's data without the X and
# activity* columns - confirm.
dat <- structure(list(x1.time = c("00:00:00", "00:01:00", "00:02:00", 
"00:03:00", "00:04:00", "00:05:00", "00:06:00", "00:07:00", "00:08:00", 
"00:09:00", "00:10:00", "00:11:00", "00:12:00", "00:13:00", "00:14:00", 
"00:15:00", "00:16:00", "00:17:00", "00:18:00", "00:19:00", "00:20:00", 
"00:21:00", "00:22:00", "00:23:00", "00:24:00", "00:25:00", "00:26:00", 
"00:27:00", "00:28:00", "00:29:00"), AI_1 = c(0.17532896077581, 
0.174249939439765, 0.174170544792533, 0.172877357886967, 0.173679017353614, 
0.174216799443538, 0.174514454250882, 0.174656389074666, 0.173377175454716, 
0.173044040397703, 0.172476572884875, 0.174738790856458, 0.173833445732856, 
0.174229265722835, 0.174392878820111, 0.174715890976243, 0.174241614289181, 
0.173229751013599, 0.173579164085914, 0.173829069216696, 0.173499039975341, 
0.174387946222767, 0.173802854581089, 0.174107580137568, 0.174113709936873, 
0.173172609295233, 0.174509255493075, 0.173383120975257, 0.173398927511582, 
0.173466516952908), AI_2 = c(0.173549588758752, 0, 0.85729795236214, 
0.513925586220723, 0.140789239632585, 0.0989981552300843, 0.321625480480368, 
0.62540390366724, 0.00714855410741877, 0, 0, 0, 0.212943798631015, 
0, 0, 0.023650258664654, 0.00159158576982517, 0.0172670511608436, 
0, 0, 0, 0.25653572767355, 0.41158598021939, 0.433889173147664, 
0.442200975044019, 0.471931171507954, 0.415009919603445, 0.43364443321512, 
0.449930874231746, 0.48397633182816), AI_3 = c(0.026069149474549, 
0.0417747330978121, 0.276687600798659, 0.258591321128928, 0.208790296683244, 
0.0300099278967508, 0.15234594700642, 0.26519848659315, 0.34220566727692, 
0.352310255219813, 0.297621781376737, 0.292800000618149, 0.481566536382664, 
0.337770306519177, 0.743182296874282, 0.256202127993172, 0.201340506649845, 
0.200155318345632, 0.237126429055375, 0.234974163009848, 0.235808994849961, 
0.302168675921402, 0.377936665388589, 0.416123299239618, 0.389279883023212, 
0.357972848973051, 0.305268847437493, 0.290040891577408, 0.197384083463156, 
0.258282654013295), AI_4 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.00841646877382803, 
0), AI_5 = c(0, 0, 0.0015062890214412, 0.00154798776365785, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0), AI_6 = c(0.190018331633492, 0.241159552783285, 0.231916111803065, 
0.193196835220518, 0.240381778378367, 0.266125762332231, 0.339227319507121, 
0.354841547583334, 0.277011867279295, 0.474462632995715, 0.516356521276347, 
0.559477604383845, 0.374857636694405, 0.376675155204282, 0.516347133869462, 
0.627633542885353, 0.565732682034457, 0.544148310829377, 0.545022418887296, 
0.602327138107482, 0.529578366594453, 0.571672817412653, 0.51963881197827, 
0.493590581088222, 0.487545798153711, 0.525272191616523, 0.586906227102549, 
0.555446579214151, 0.578788883825157, 0.617822898150646), AI_7 = c(0.139608768263461, 
0.165583663096789, 0.326959508587122, 0.221739297198209, 0.160657663051105, 
0.107439748199699, 0.117594125364214, 0.133528520361788, 0.117950354159875, 
0.131428192187155, 0.125355403562937, 0.119185646272255, 0.196285453922129, 
0.167061057207379, 0.169855099745761, 0.141077126343563, 0.078433720675593, 
0.0999303057993443, 0.0798045801131668, 0.0331137028671696, 0.0920945831761988, 
0.0233052285173748, 0, 0, 0, 0.00876293044107867, 0, 0.109134564970416, 
0.110323312017635, 0.117772975747077), ID = c("ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", 
"ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1", "ID1"
)), row.names = c(NA, 30L), class = "data.frame")
  • Related